@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
@@ -0,0 +1,411 @@
+/**
+ * OpenAI Realtime Voice API Handler
+ *
+ * Implementation of bidirectional voice communication using OpenAI's Realtime API.
+ *
+ * @module voice/providers/OpenAIRealtime
+ */
+import { logger } from "../../utils/logger.js";
+import { RealtimeError } from "../errors.js";
+import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
+/**
+ * OpenAI Realtime API Handler
+ *
+ * Implements bidirectional voice communication with OpenAI's Realtime API.
+ *
+ * @see https://platform.openai.com/docs/api-reference/realtime
+ */
+export class OpenAIRealtime extends BaseRealtimeHandler {
+    name = "openai-realtime";
+    apiKey;
+    ws = null;
+    audioChunkIndex = 0;
+    constructor(apiKey) {
+        super();
+        // Match the trim+null-coerce pattern used by sibling providers
+        // (OpenAITTS/AzureSTT/AzureTTS/ElevenLabsTTS/GeminiLive/OpenAISTT/GoogleSTT)
+        // so empty/whitespace `OPENAI_API_KEY=""` surfaces as PROVIDER_NOT_CONFIGURED
+        // instead of a downstream 401, and `isConfigured()` agrees with `connect()`.
+        const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
+        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
+    }
+    isConfigured() {
+        return this.apiKey !== null;
+    }
+    getSupportedFormats() {
+        // Session uses pcm16 for both input_audio_format and output_audio_format,
+        // and audio-delta/audio-done chunks are tagged as `format: "pcm16"`. The
+        // advertised list must match what's actually emitted.
+        return ["pcm16"];
+    }
+    async connect(config) {
+        if (!this.apiKey) {
+            throw RealtimeError.providerNotConfigured("openai-realtime");
+        }
+        if (this.isConnected()) {
+            throw RealtimeError.sessionAlreadyActive("openai-realtime");
+        }
+        this.emitStateChange("connecting");
+        try {
+            // Import WebSocket
+            const { default: WebSocket } = await import("ws");
+            // Determine model
+            const model = config.model ?? "gpt-4o-realtime-preview-2024-12-17";
+            // Connect to OpenAI Realtime API
+            const wsUrl = `wss://api.openai.com/v1/realtime?model=${model}`;
+            this.ws = new WebSocket(wsUrl, {
+                headers: {
+                    Authorization: `Bearer ${this.apiKey}`,
+                    "OpenAI-Beta": "realtime=v1",
+                },
+            });
+            // Wait for connection. Capture a local reference so the closure
+            // doesn't need a non-null assertion on `this.ws` (Issue 9).
+            const ws = this.ws;
+            await new Promise((resolve, reject) => {
+                const timeout = setTimeout(() => {
+                    reject(new Error("Connection timeout"));
+                }, config.timeout ?? 30000);
+                ws.on("open", () => {
+                    clearTimeout(timeout);
+                    resolve();
+                });
+                ws.on("error", (err) => {
+                    clearTimeout(timeout);
+                    reject(err);
+                });
+            });
+            // Set up message handler
+            this.ws.on("message", (data) => {
+                this.handleMessage(data);
+            });
+            this.ws.on("close", () => {
+                this.emitStateChange("disconnected");
+                this.session = null;
+            });
+            this.ws.on("error", (err) => {
+                this.emitError(err);
+            });
+            // Send session update with configuration
+            await this.sendSessionUpdate(config);
+            // Wait for session.created event
+            const sessionId = await this.waitForSessionCreated();
+            // Create session object
+            this.session = this.createSession(sessionId, config);
+            this.emitStateChange("connected");
+            logger.info(`[OpenAIRealtimeHandler] Connected to session: ${sessionId}`);
+            return this.session;
+        }
+        catch (err) {
+            this.emitStateChange("error");
+            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
+            throw RealtimeError.connectionFailed(errorMessage, "openai-realtime", err instanceof Error ? err : undefined);
+        }
+    }
+    async disconnect() {
+        if (!this.ws) {
+            return;
+        }
+        this.emitStateChange("disconnecting");
+        try {
+            this.ws.close();
+            this.ws = null;
+            this.session = null;
+            this.audioChunkIndex = 0;
+            this.emitStateChange("disconnected");
+            logger.info("[OpenAIRealtimeHandler] Disconnected");
+        }
+        catch (err) {
+            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
+            throw RealtimeError.protocolError(`Disconnect failed: ${errorMessage}`, "openai-realtime", err instanceof Error ? err : undefined);
+        }
+    }
+    async sendAudio(audio) {
+        if (!this.ws || !this.isConnected()) {
+            throw RealtimeError.sessionNotActive("openai-realtime");
+        }
+        const audioBuffer = Buffer.isBuffer(audio) ? audio : audio.data;
+        // Send audio append event
+        const event = {
+            type: "input_audio_buffer.append",
+            audio: audioBuffer.toString("base64"),
+        };
+        this.ws.send(JSON.stringify(event));
+    }
+    async sendText(text) {
+        if (!this.ws || !this.isConnected()) {
+            throw RealtimeError.sessionNotActive("openai-realtime");
+        }
+        // Send conversation item create event
+        const event = {
+            type: "conversation.item.create",
+            item: {
+                type: "message",
+                role: "user",
+                content: [
+                    {
+                        type: "input_text",
+                        text,
+                    },
+                ],
+            },
+        };
+        this.ws.send(JSON.stringify(event));
+        // Trigger response
+        await this.triggerResponse();
+    }
+    async triggerResponse() {
+        if (!this.ws || !this.isConnected()) {
+            throw RealtimeError.sessionNotActive("openai-realtime");
+        }
+        // Commit audio buffer
+        this.ws.send(JSON.stringify({
+            type: "input_audio_buffer.commit",
+        }));
+        // Create response
+        this.ws.send(JSON.stringify({
+            type: "response.create",
+        }));
+    }
+    async cancelResponse() {
+        if (!this.ws || !this.isConnected()) {
+            return;
+        }
+        this.ws.send(JSON.stringify({
+            type: "response.cancel",
+        }));
+    }
+    /**
+     * Send session update with configuration
+     */
+    async sendSessionUpdate(config) {
+        if (!this.ws) {
+            return;
+        }
+        const sessionConfig = {
+            modalities: ["text", "audio"],
+            input_audio_format: "pcm16",
+            output_audio_format: "pcm16",
+            input_audio_transcription: {
+                model: "whisper-1",
+            },
+        };
+        // Add voice if specified
+        if (config.voice) {
+            sessionConfig.voice = config.voice;
+        }
+        // Add turn detection
+        if (config.turnDetection) {
+            sessionConfig.turn_detection = {
+                type: config.turnDetection,
+                threshold: config.vadThreshold ?? 0.5,
+                prefix_padding_ms: 300,
+                silence_duration_ms: 500,
+            };
+        }
+        // Add system prompt
+        if (config.systemPrompt) {
+            sessionConfig.instructions = config.systemPrompt;
+        }
+        // Add tools
+        if (config.tools && config.tools.length > 0) {
+            sessionConfig.tools = config.tools.map((tool) => ({
+                type: "function",
+                name: tool.name,
+                description: tool.description,
+                parameters: tool.parameters,
+            }));
+        }
+        const event = {
+            type: "session.update",
+            session: sessionConfig,
+        };
+        this.ws.send(JSON.stringify(event));
+    }
+    /**
+     * Wait for session.created event
+     */
+    waitForSessionCreated() {
+        return new Promise((resolve, reject) => {
+            const handler = (data) => {
+                try {
+                    const event = JSON.parse(data.toString());
+                    if (event.type === "session.created") {
+                        clearTimeout(timeout);
+                        this.ws?.off("message", handler);
+                        const sessionEvent = event;
+                        resolve(sessionEvent.session.id);
+                    }
+                    else if (event.type === "error") {
+                        clearTimeout(timeout);
+                        this.ws?.off("message", handler);
+                        reject(new Error(event.error?.message ??
+                            "Unknown error"));
+                    }
+                }
+                catch {
+                    // Ignore parse errors
+                }
+            };
+            const timeout = setTimeout(() => {
+                // M1: detach the message handler before rejecting so subsequent
+                // OpenAI Realtime messages don't invoke a dangling handler for the
+                // connection lifetime. (The success and event-error paths above
+                // already off-detach; only the timeout path was leaking.)
+                this.ws?.off("message", handler);
+                reject(new Error("Timeout waiting for session.created"));
+            }, 10000);
+            this.ws?.on("message", handler);
+        });
+    }
+    /**
+     * Handle incoming WebSocket messages
+     */
+    handleMessage(data) {
+        try {
+            const event = JSON.parse(data.toString());
+            switch (event.type) {
+                case "response.audio.delta": {
+                    const audioEvent = event;
+                    const audioData = Buffer.from(audioEvent.delta, "base64");
+                    this.emitAudio({
+                        data: audioData,
+                        index: this.audioChunkIndex++,
+                        isFinal: false,
+                        // M7: session is configured with output_audio_format "pcm16",
+                        // so OpenAI sends raw 16-bit PCM, not WAV-headered bytes.
+                        // Tagging as "pcm16" prevents downstream consumers (e.g.
+                        // calculateWavDuration) from mis-parsing the buffer as RIFF/WAV.
+                        format: "pcm16",
+                        sampleRate: 24000,
+                    });
+                    break;
+                }
+                case "response.audio.done": {
+                    // Audio stream complete
+                    this.emitAudio({
+                        data: Buffer.alloc(0),
+                        index: this.audioChunkIndex++,
+                        isFinal: true,
+                        // M7: session is configured with output_audio_format "pcm16",
+                        // so OpenAI sends raw 16-bit PCM, not WAV-headered bytes.
+                        // Tagging as "pcm16" prevents downstream consumers (e.g.
+                        // calculateWavDuration) from mis-parsing the buffer as RIFF/WAV.
+                        format: "pcm16",
+                        sampleRate: 24000,
+                    });
+                    break;
+                }
+                case "response.audio_transcript.delta": {
+                    const transcriptEvent = event;
+                    if (transcriptEvent.delta) {
+                        this.emitText(transcriptEvent.delta, false);
+                    }
+                    break;
+                }
+                case "response.audio_transcript.done": {
+                    // Final transcript
+                    const finalEvent = event;
+                    if (finalEvent.transcript) {
+                        this.emitText(finalEvent.transcript, true);
+                    }
+                    break;
+                }
+                case "conversation.item.input_audio_transcription.completed": {
+                    const transcriptEvent = event;
+                    if (transcriptEvent.transcript) {
+                        this.emitTranscript(transcriptEvent.transcript, true);
+                    }
+                    break;
+                }
+                case "response.function_call_arguments.done": {
+                    const funcEvent = event;
+                    if (funcEvent.name && funcEvent.call_id && funcEvent.arguments) {
+                        try {
+                            const args = JSON.parse(funcEvent.arguments);
+                            // NEW6: defense-in-depth. handleFunctionCall already wraps its
+                            // body in try/catch, so the inner path is covered today. This
+                            // outer .catch is here to ensure any future un-caught path
+                            // (e.g. a refactor that drops the inner catch, or `logger.error`
+                            // itself throwing inside that catch) doesn't crash the process
+                            // or hang the session via an unhandled-rejection. Issue 5.
+                            void this.handleFunctionCall(funcEvent.name, args, funcEvent.call_id).catch((err) => {
+                                logger.error(`[OpenAIRealtimeHandler] handleFunctionCall failed: ${err instanceof Error ? err.message : String(err)}`);
+                            });
+                        }
+                        catch {
+                            logger.warn("[OpenAIRealtimeHandler] Failed to parse function arguments");
+                        }
+                    }
+                    break;
+                }
+                case "response.done": {
+                    this.emitTurnEnd();
+                    this.audioChunkIndex = 0;
+                    break;
+                }
+                case "input_audio_buffer.speech_started": {
+                    this.emitTurnStart();
+                    break;
+                }
+                case "error": {
+                    const errorEvent = event;
+                    const errorMessage = errorEvent.error?.message ?? "Unknown error";
+                    this.emitError(new Error(errorMessage));
+                    break;
+                }
+                default:
+                    // Log unhandled events at debug level
+                    logger.debug(`[OpenAIRealtimeHandler] Unhandled event: ${event.type}`);
+            }
+        }
+        catch (err) {
+            logger.warn(`[OpenAIRealtimeHandler] Failed to parse message: ${err instanceof Error ? err.message : String(err)}`);
+        }
+    }
+    /**
+     * Handle function call from model
+     */
+    async handleFunctionCall(name, args, callId) {
+        try {
+            const result = await this.emitFunctionCall(name, args);
+            // Send function result back
+            if (this.ws && this.isConnected()) {
+                this.ws.send(JSON.stringify({
+                    type: "conversation.item.create",
+                    item: {
+                        type: "function_call_output",
+                        call_id: callId,
+                        output: JSON.stringify(result),
+                    },
+                }));
+                // Trigger response with function result
+                await this.triggerResponse();
+            }
+        }
+        catch (err) {
+            const errMessage = err instanceof Error ? err.message : String(err);
+            logger.error(`[OpenAIRealtimeHandler] Function call failed: ${errMessage}`);
+            // M6: send a function_call_output with the error so OpenAI Realtime
+            // can resume the turn. Without this the session stalls indefinitely
+            // (the model waits for a function result before continuing) and the
+            // user hears silence.
+            if (this.ws && this.isConnected()) {
+                try {
+                    this.ws.send(JSON.stringify({
+                        type: "conversation.item.create",
+                        item: {
+                            type: "function_call_output",
+                            call_id: callId,
+                            output: JSON.stringify({ error: errMessage }),
+                        },
+                    }));
+                    await this.triggerResponse();
+                }
+                catch (sendErr) {
+                    logger.error(`[OpenAIRealtimeHandler] Failed to send error result for ${callId}: ${sendErr instanceof Error ? sendErr.message : String(sendErr)}`);
+                }
+            }
+        }
+    }
+}
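
For orientation, here is a minimal consumer-side sketch of the handler in the hunk above (the +411-line file listed as package/dist/voice/providers/OpenAIRealtime.js). It exercises only what the hunk itself shows: the constructor's OPENAI_API_KEY fallback, the config fields connect() reads (model, voice, turnDetection, vadThreshold, systemPrompt, timeout), and the pcm16-only audio contract. The import path and the config shape are assumptions, not the package's documented API, and the event-subscription side inherited from BaseRealtimeHandler (audio/text/transcript callbacks) lives in a different file and is not sketched here.

```ts
// Sketch only, under stated assumptions: the entry point is hypothetical,
// and the config object mirrors the fields the hunk reads rather than the
// package's real RealtimeConfig type.
import { OpenAIRealtime } from "@juspay/neurolink"; // hypothetical entry point

async function demo(pcm16Frame: Buffer): Promise<void> {
  const handler = new OpenAIRealtime(); // no arg: falls back to process.env.OPENAI_API_KEY
  if (!handler.isConfigured()) {
    throw new Error("OPENAI_API_KEY is missing or blank");
  }
  await handler.connect({
    model: "gpt-4o-realtime-preview-2024-12-17", // the hunk's default model
    voice: "alloy",              // forwarded as session.voice (illustrative value)
    turnDetection: "server_vad", // forwarded as turn_detection.type (illustrative value)
    vadThreshold: 0.5,           // forwarded as turn_detection.threshold
    systemPrompt: "You are a concise voice assistant.",
    timeout: 30000,              // connect() timeout in ms; 30000 is the default
  });
  // The session pins input_audio_format and output_audio_format to "pcm16",
  // so frames must be raw 16-bit PCM. Output chunks are tagged with
  // sampleRate 24000; for mono 16-bit audio that means a chunk lasts
  // data.length / (24000 * 2) seconds.
  await handler.sendAudio(pcm16Frame);
  await handler.triggerResponse(); // commits the input buffer and requests a response
  await handler.disconnect();
}
```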
@@ -0,0 +1,41 @@
+/**
+ * OpenAI Whisper Speech-to-Text Handler
+ *
+ * Implementation of STT using OpenAI's Whisper model.
+ *
+ * @module voice/providers/OpenAISTT
+ */
+import type { TTSAudioFormat, STTHandler, STTLanguage, STTOptions, STTResult } from "../../types/index.js";
+/**
+ * OpenAI Whisper Speech-to-Text Handler
+ *
+ * Supports transcription and translation using OpenAI's Whisper model.
+ *
+ * @see https://platform.openai.com/docs/api-reference/audio
+ */
+export declare class OpenAISTT implements STTHandler {
+    private readonly apiKey;
+    private readonly baseUrl;
+    /**
+     * Maximum audio duration in seconds (25 minutes)
+     */
+    readonly maxAudioDuration: number;
+    /**
+     * Whisper does not support streaming
+     */
+    readonly supportsStreaming = false;
+    constructor(apiKey?: string);
+    isConfigured(): boolean;
+    getSupportedFormats(): TTSAudioFormat[];
+    getSupportedLanguages(): Promise<STTLanguage[]>;
+    transcribe(audio: Buffer | ArrayBuffer, options?: STTOptions): Promise<STTResult>;
+    /**
+     * Get MIME type for audio format. Whisper auto-detects from headers, but
+     * sending a correct MIME helps providers / proxies that sniff Content-Type.
+     * Must stay aligned with `getSupportedFormats()`.
+     */
+    private getMimeType;
+}
+export { OpenAISTT as WhisperSTT };
+export { OpenAISTT as WhisperSTTHandler };
+export { OpenAISTT as OpenAISTTHandler };