@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Gemini Live Voice API Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of bidirectional voice communication using Gemini's Live API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/GeminiLive
|
|
7
|
+
*/
|
|
8
|
+
import { logger } from "../../utils/logger.js";
|
|
9
|
+
import { RealtimeError } from "../errors.js";
|
|
10
|
+
import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
|
|
11
|
+
/**
|
|
12
|
+
* Google Gemini Live Voice API Handler
|
|
13
|
+
*
|
|
14
|
+
* Implements bidirectional voice communication with Gemini's Live API.
|
|
15
|
+
*
|
|
16
|
+
* @see https://ai.google.dev/gemini-api/docs/live
|
|
17
|
+
*/
|
|
18
|
+
export class GeminiLive extends BaseRealtimeHandler {
    /** Provider identifier passed to RealtimeError factories. */
    name = "gemini-live";
    /** Resolved API key, or `null` when none was passed in or found in the environment. */
    apiKey;
    /** Socket of the current connection attempt/session, or `null` when there is none. */
    ws = null;
    /** Index stamped on the next emitted audio chunk; reset to 0 at every turn end. */
    audioChunkIndex = 0;
    /** Tool calls that have been received but not yet answered (call id -> function name). */
    pendingFunctionCalls = new Map();
    /**
     * @param apiKey Explicit API key. When omitted (null/undefined) the key is
     *   read from GOOGLE_API_KEY, then GOOGLE_AI_API_KEY, then GEMINI_API_KEY.
     *   A blank/whitespace-only result leaves the handler unconfigured.
     */
    constructor(apiKey) {
        super();
        // GOOGLE_AI_API_KEY / GEMINI_API_KEY are accepted as aliases because
        // `.env.example` documents them as the canonical Google credentials.
        const resolvedKey = (apiKey ??
            process.env.GOOGLE_API_KEY ??
            process.env.GOOGLE_AI_API_KEY ??
            process.env.GEMINI_API_KEY ??
            "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /** @returns `true` when a non-empty API key was resolved at construction. */
    isConfigured() {
        return this.apiKey !== null;
    }
    /** @returns The audio format identifiers this handler advertises. */
    getSupportedFormats() {
        return ["opus", "wav"];
    }
    /**
     * Open a Live API WebSocket, send the setup message, wait for the server's
     * `setupComplete`, and only then start dispatching messages.
     *
     * @param config Session configuration. Fields read here: `model`, `timeout`
     *   (handshake timeout in ms, default 30000); `sendSetup` additionally reads
     *   `voice`, `systemPrompt` and `tools`.
     * @returns The session object produced by `createSession`.
     * @throws RealtimeError `providerNotConfigured` when no API key is set,
     *   `sessionAlreadyActive` when already connected, `connectionFailed` for
     *   any failure during handshake or setup.
     */
    async connect(config) {
        if (!this.apiKey) {
            throw RealtimeError.providerNotConfigured("gemini-live");
        }
        if (this.isConnected()) {
            throw RealtimeError.sessionAlreadyActive("gemini-live");
        }
        this.emitStateChange("connecting");
        // Declared outside the try so the catch block can tear the socket down.
        let socket = null;
        try {
            // Import WebSocket lazily so `ws` is only loaded when voice is used.
            const { default: WebSocket } = await import("ws");
            const model = config.model ?? "gemini-2.5-flash-native-audio-preview-09-2025";
            // Encode the key so an unexpected character cannot corrupt the query string.
            const wsUrl = `wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key=${encodeURIComponent(this.apiKey)}`;
            socket = new WebSocket(wsUrl);
            this.ws = socket;
            // Local alias: every listener below is bound to THIS socket, never to
            // whatever `this.ws` happens to point at later.
            const ws = socket;
            // Wait for the handshake to finish (open), fail (error) or time out.
            await new Promise((resolve, reject) => {
                const openHandler = () => {
                    clearTimeout(timeout);
                    ws.off("error", errorHandler);
                    resolve();
                };
                const errorHandler = (err) => {
                    clearTimeout(timeout);
                    ws.off("open", openHandler);
                    reject(err);
                };
                const timeout = setTimeout(() => {
                    // Only detach and reject here; the catch block below performs
                    // the actual teardown via discardSocket(), which keeps an
                    // 'error' listener attached while terminating.
                    ws.off("open", openHandler);
                    ws.off("error", errorHandler);
                    reject(new Error("Connection timeout"));
                }, config.timeout ?? 30000);
                ws.on("open", openHandler);
                ws.on("error", errorHandler);
            });
            ws.on("close", () => {
                // Ignore close events from a socket that is no longer current
                // (disconnect() already reported the state, or a newer connect()
                // replaced it) so a late event cannot clobber a fresh session.
                if (this.ws !== ws) {
                    return;
                }
                this.emitStateChange("disconnected");
                this.session = null;
            });
            ws.on("error", (err) => {
                this.emitError(err);
            });
            // Send setup message
            await this.sendSetup(config, model);
            // Wait for setup complete BEFORE attaching the permanent message handler,
            // otherwise early audio/text data arriving during setup race window is
            // dispatched to handleMessage before consumers register their handlers.
            await this.waitForSetupComplete();
            ws.on("message", (data) => {
                this.handleMessage(data);
            });
            const sessionId = `gemini-${Date.now()}`;
            this.session = this.createSession(sessionId, config);
            this.emitStateChange("connected");
            logger.info(`[GeminiLiveHandler] Connected to session: ${sessionId}`);
            return this.session;
        }
        catch (err) {
            // Release the half-open/failed socket so it is not leaked and a
            // retry starts from a clean slate.
            this.discardSocket(socket);
            this.emitStateChange("error");
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            throw RealtimeError.connectionFailed(errorMessage, "gemini-live", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Forcefully tear down a socket from a failed connection attempt.
     *
     * All listeners are removed and replaced with a single no-op 'error'
     * listener before terminating: aborting a socket that is still connecting
     * makes `ws` emit 'error' asynchronously, and an 'error' event without a
     * listener is thrown by Node as an uncaught exception.
     *
     * @param socket The socket to discard; `null`/`undefined` is a no-op.
     */
    discardSocket(socket) {
        if (!socket) {
            return;
        }
        socket.removeAllListeners();
        socket.on("error", () => { });
        try {
            socket.terminate();
        }
        catch {
            // Best effort: the socket is being thrown away regardless.
        }
        if (this.ws === socket) {
            this.ws = null;
        }
    }
    /**
     * Close the socket and reset per-session state. No-op when no socket exists.
     *
     * @throws RealtimeError `protocolError` when closing the socket throws.
     */
    async disconnect() {
        if (!this.ws) {
            return;
        }
        this.emitStateChange("disconnecting");
        try {
            this.ws.close();
            this.ws = null;
            this.session = null;
            this.audioChunkIndex = 0;
            this.pendingFunctionCalls.clear();
            this.emitStateChange("disconnected");
            logger.info("[GeminiLiveHandler] Disconnected");
        }
        catch (err) {
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            throw RealtimeError.protocolError(`Disconnect failed: ${errorMessage}`, "gemini-live", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Send one chunk of input audio as a `realtimeInput` message.
     *
     * @param audio Either a Buffer, or an object whose `data` property is a Buffer.
     *   The chunk is labelled `audio/pcm;rate=16000`; no conversion is done here,
     *   so the caller is presumably expected to supply 16 kHz PCM — confirm.
     * @throws RealtimeError `sessionNotActive` when not connected.
     */
    async sendAudio(audio) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive("gemini-live");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : audio.data;
        const message = {
            realtimeInput: {
                mediaChunks: [
                    {
                        mimeType: "audio/pcm;rate=16000",
                        data: audioBuffer.toString("base64"),
                    },
                ],
            },
        };
        this.ws.send(JSON.stringify(message));
    }
    /**
     * Send a complete user text turn as a `clientContent` message.
     *
     * @param text The user's text.
     * @throws RealtimeError `sessionNotActive` when not connected.
     */
    async sendText(text) {
        if (!this.ws || !this.isConnected()) {
            throw RealtimeError.sessionNotActive("gemini-live");
        }
        const message = {
            clientContent: {
                turns: [
                    {
                        role: "user",
                        parts: [{ text }],
                    },
                ],
                turnComplete: true,
            },
        };
        this.ws.send(JSON.stringify(message));
    }
    /**
     * Intentionally a no-op: Gemini generates responses on its own based on
     * voice-activity detection, so there is nothing to trigger.
     */
    async triggerResponse() {
        // Gemini automatically generates responses based on VAD
    }
    /**
     * Best-effort interruption: Gemini has no explicit cancel message, so an
     * empty completed turn is sent instead. Silently does nothing when not
     * connected.
     */
    async cancelResponse() {
        if (this.ws && this.isConnected()) {
            const message = {
                clientContent: {
                    turns: [],
                    turnComplete: true,
                },
            };
            this.ws.send(JSON.stringify(message));
        }
    }
    /**
     * Build and send the initial `setup` message.
     *
     * @param config Session configuration; reads `voice` (default "Puck"),
     *   `systemPrompt` and `tools` (each with `name`, `description`, `parameters`).
     * @param model Model name without the `models/` prefix.
     */
    async sendSetup(config, model) {
        if (!this.ws) {
            return;
        }
        const setup = {
            model: `models/${model}`,
            generationConfig: {
                // NOTE(review): the Live API documentation appears to allow only
                // one response modality per session — confirm that requesting
                // both AUDIO and TEXT is accepted by the target model.
                responseModalities: ["AUDIO", "TEXT"],
                speechConfig: {
                    voiceConfig: {
                        prebuiltVoiceConfig: {
                            voiceName: config.voice ?? "Puck",
                        },
                    },
                },
            },
        };
        // Optional: system instruction
        if (config.systemPrompt) {
            setup.systemInstruction = {
                parts: [{ text: config.systemPrompt }],
            };
        }
        // Optional: function declarations for tool calling
        if (config.tools && config.tools.length > 0) {
            setup.tools = [
                {
                    functionDeclarations: config.tools.map((tool) => ({
                        name: tool.name,
                        description: tool.description,
                        parameters: tool.parameters,
                    })),
                },
            ];
        }
        const setupMessage = { setup };
        this.ws.send(JSON.stringify(setupMessage));
    }
    /**
     * Wait for the server's `setupComplete` message on the current socket.
     *
     * Rejects as soon as the outcome is known instead of always waiting for
     * the timeout: immediately when there is no socket, and on the first
     * 'close' or 'error' event (e.g. the server refusing the key or model).
     * Messages that are not valid JSON or are not `setupComplete` are ignored.
     *
     * @param timeoutMs Maximum time to wait, in milliseconds (default 10000).
     * @returns Promise that resolves once `setupComplete` has been received.
     */
    waitForSetupComplete(timeoutMs = 10000) {
        // Bind to the socket that exists now, so listeners are always removed
        // from the same socket even if `this.ws` changes in the meantime.
        const socket = this.ws;
        if (!socket) {
            return Promise.reject(new Error("Cannot wait for setup complete: no open socket"));
        }
        return new Promise((resolve, reject) => {
            const detach = () => {
                clearTimeout(timer);
                socket.off("message", onMessage);
                socket.off("close", onClose);
                socket.off("error", onError);
            };
            const onMessage = (data) => {
                let response;
                try {
                    response = JSON.parse(data.toString());
                }
                catch {
                    // Ignore parse errors
                    return;
                }
                if (response?.setupComplete) {
                    detach();
                    resolve();
                }
            };
            const onClose = (code, reason) => {
                detach();
                const detail = reason && reason.length > 0 ? `: ${reason.toString()}` : "";
                reject(new Error(`Connection closed before setup complete (code ${code})${detail}`));
            };
            const onError = (err) => {
                detach();
                reject(err instanceof Error ? err : new Error(String(err || "Unknown error")));
            };
            const timer = setTimeout(() => {
                detach();
                reject(new Error("Timeout waiting for setup complete"));
            }, timeoutMs);
            socket.on("message", onMessage);
            socket.on("close", onClose);
            socket.on("error", onError);
        });
    }
    /**
     * Dispatch one incoming server message: model text/audio parts, turn
     * boundaries, tool calls and tool-call cancellations. Any exception raised
     * while parsing or dispatching is logged as a warning and swallowed.
     *
     * @param data Raw WebSocket payload; must stringify to JSON.
     */
    handleMessage(data) {
        try {
            const response = JSON.parse(data.toString());
            if (response.serverContent) {
                const content = response.serverContent;
                const isFinal = content.turnComplete ?? false;
                if (content.modelTurn?.parts) {
                    for (const part of content.modelTurn.parts) {
                        if (part.text) {
                            this.emitText(part.text, isFinal);
                        }
                        if (part.inlineData) {
                            const audioData = Buffer.from(part.inlineData.data, "base64");
                            this.emitAudio({
                                data: audioData,
                                index: this.audioChunkIndex++,
                                isFinal,
                                format: this.parseAudioFormat(part.inlineData.mimeType),
                                sampleRate: 24000,
                            });
                        }
                    }
                }
                // A completed and an interrupted turn both end the turn; handle
                // them together so a message carrying both flags ends it once.
                if (content.turnComplete || content.interrupted) {
                    this.emitTurnEnd();
                    this.audioChunkIndex = 0;
                }
            }
            if (response.toolCall?.functionCalls) {
                for (const call of response.toolCall.functionCalls) {
                    this.pendingFunctionCalls.set(call.id, call.name);
                    // Not awaited on purpose: handleFunctionCall reports its
                    // own failures, and tool execution must not block dispatch.
                    this.handleFunctionCall(call.id, call.name, call.args);
                }
            }
            // Dropping the id here is what tells handleFunctionCall to discard
            // the result of a call that is still running.
            if (response.toolCallCancellation?.ids) {
                for (const id of response.toolCallCancellation.ids) {
                    this.pendingFunctionCalls.delete(id);
                }
            }
        }
        catch (err) {
            logger.warn(`[GeminiLiveHandler] Failed to parse message: ${err instanceof Error ? err.message : String(err)}`);
        }
    }
    /**
     * Map a MIME type onto an audio format identifier.
     *
     * Matching is case-insensitive, and a missing/non-string MIME type is
     * treated like an unrecognised one rather than throwing.
     *
     * @param mimeType MIME type reported by the server (may be absent).
     * @returns "opus", "wav" (also used for PCM) or "mp3"; "opus" when unrecognised.
     */
    parseAudioFormat(mimeType) {
        const normalized = typeof mimeType === "string" ? mimeType.toLowerCase() : "";
        if (normalized.includes("opus")) {
            return "opus";
        }
        if (normalized.includes("wav") || normalized.includes("pcm")) {
            return "wav";
        }
        if (normalized.includes("mp3") || normalized.includes("mpeg")) {
            return "mp3";
        }
        return "opus";
    }
    /**
     * Run a model-requested function and send its result back as a
     * `toolResponse`.
     *
     * The result is discarded when the call is no longer pending once the
     * function returns (the server cancelled it, or the session was torn down)
     * or when the socket is no longer connected. Failures are logged and
     * emitted via `emitError`; nothing is sent to the server in that case.
     * The pending entry is removed on every path.
     *
     * @param callId Server-assigned id of the function call.
     * @param name Function name.
     * @param args Arguments supplied by the model.
     */
    async handleFunctionCall(callId, name, args) {
        try {
            const result = await this.emitFunctionCall(name, args);
            if (!this.pendingFunctionCalls.has(callId)) {
                // Cancelled (or session reset) while the function was running.
                return;
            }
            if (this.ws && this.isConnected()) {
                const responseMessage = {
                    toolResponse: {
                        functionResponses: [
                            {
                                id: callId,
                                name,
                                response: { result },
                            },
                        ],
                    },
                };
                this.ws.send(JSON.stringify(responseMessage));
            }
        }
        catch (err) {
            const error = err instanceof Error
                ? err
                : new Error(String(err || "Function call failed"));
            logger.error(`[GeminiLiveHandler] Function call failed: ${error.message}`);
            this.emitError(error);
        }
        finally {
            // Always drop the entry so the Map cannot grow over a long session
            // with failing tools or responses that could not be sent.
            this.pendingFunctionCalls.delete(callId);
        }
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Cloud Speech-to-Text Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of STT using Google Cloud Speech-to-Text API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/GoogleSTT
|
|
7
|
+
*/
|
|
8
|
+
import type { TTSAudioFormat, STTHandler, STTLanguage, STTOptions, STTResult, TranscriptionSegment } from "../../types/index.js";
|
|
9
|
+
/**
|
|
10
|
+
* Google Cloud Speech-to-Text Handler
|
|
11
|
+
*
|
|
12
|
+
* Supports transcription with speaker diarization, word timestamps, and punctuation.
|
|
13
|
+
*
|
|
14
|
+
* @see https://cloud.google.com/speech-to-text/docs
|
|
15
|
+
*/
|
|
16
|
+
export declare class GoogleSTT implements STTHandler {
    // Private members: declaration output omits their types, so only the
    // names are visible here. Presumably populated from the constructor
    // arguments of the same names — confirm against the implementation.
    private readonly apiKey;
    private readonly credentialsPath;
    private readonly baseUrl;
    /**
     * Maximum audio duration, in seconds, accepted by the synchronous
     * recognize endpoint. Longer audio would need the asynchronous
     * longrunningrecognize endpoint, which is not yet implemented.
     */
    readonly maxAudioDuration = 60;
    /**
     * Always `false`: true streaming requires gRPC, which is not yet
     * implemented. `transcribeStream()` uses a chunk-and-batch workaround
     * instead.
     */
    readonly supportsStreaming = false;
    /**
     * @param apiKey Optional API key.
     * @param credentialsPath Optional credentials path (presumably a
     *   service-account file, given `getAccessToken` below — confirm).
     */
    constructor(apiKey?: string, credentialsPath?: string);
    /** Reports whether the handler is configured for use. */
    isConfigured(): boolean;
    /** Lists the audio formats accepted for transcription. */
    getSupportedFormats(): TTSAudioFormat[];
    /** Resolves with the languages available for transcription. */
    getSupportedLanguages(): Promise<STTLanguage[]>;
    /**
     * Transcribe a complete audio payload.
     *
     * @param audio Audio bytes as a Buffer or ArrayBuffer.
     * @param options Optional transcription options.
     * @returns Promise resolving to the transcription result.
     */
    transcribe(audio: Buffer | ArrayBuffer, options?: STTOptions): Promise<STTResult>;
    /**
     * Stream-shaped transcription. This is not true streaming (that requires
     * WebSocket/gRPC support, which is not implemented); per the note on
     * `supportsStreaming`, a chunk-and-batch workaround is used.
     *
     * @param audioStream Async iterable of audio Buffers.
     * @param options Transcription options (required here, unlike `transcribe`).
     * @returns Async iterable of transcription segments.
     */
    transcribeStream(audioStream: AsyncIterable<Buffer>, options: STTOptions): AsyncIterable<TranscriptionSegment>;
    /**
     * Get encoding string for audio format
     */
    private getEncoding;
    /**
     * Parse duration string (e.g., "1.5s") to seconds
     */
    private parseDuration;
    /**
     * Calculate average confidence from results
     */
    private calculateAverageConfidence;
    /**
     * Get access token from service account credentials.
     *
     * Failures are rethrown as STTError so the caller sees the authentication
     * root cause. An earlier version caught every error and returned `""`,
     * which produced a silent 401 from the Google API and a confusing
     * downstream HTTP error with no trace of the original auth failure.
     */
    private getAccessToken;
}
|