@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speech-to-Text (STT) Processing Utility
|
|
3
|
+
*
|
|
4
|
+
* Central orchestrator for all STT operations across providers.
|
|
5
|
+
* Manages provider-specific STT handlers and audio transcription.
|
|
6
|
+
*
|
|
7
|
+
* @module utils/sttProcessor
|
|
8
|
+
*/
|
|
9
|
+
import { logger } from "./logger.js";
|
|
10
|
+
import { STT_ERROR_CODES } from "../types/index.js";
|
|
11
|
+
import { ErrorCategory, ErrorSeverity } from "../constants/enums.js";
|
|
12
|
+
import { STTError } from "../voice/errors.js";
|
|
13
|
+
import { SpanSerializer, SpanType, SpanStatus, getMetricsAggregator, } from "../observability/index.js";
|
|
14
|
+
/**
|
|
15
|
+
* STT processor class for orchestrating speech-to-text operations
|
|
16
|
+
*
|
|
17
|
+
* Follows the same pattern as TTSProcessor, CSVProcessor, ImageProcessor, and PDFProcessor.
|
|
18
|
+
* Provides a unified interface for STT transcription across multiple providers.
|
|
19
|
+
*
|
|
20
|
+
* @example
|
|
21
|
+
* ```typescript
|
|
22
|
+
* // Register a handler
|
|
23
|
+
* STTProcessor.registerHandler('whisper', whisperHandler);
|
|
24
|
+
*
|
|
25
|
+
* // Check if provider is supported
|
|
26
|
+
* if (STTProcessor.supports('whisper')) {
|
|
27
|
+
* // Provider is registered
|
|
28
|
+
* }
|
|
29
|
+
* ```
|
|
30
|
+
*/
|
|
31
|
+
export class STTProcessor {
|
|
32
|
+
/**
|
|
33
|
+
* Handler registry mapping provider names to STT handlers
|
|
34
|
+
* Uses Map for O(1) lookups and better type safety
|
|
35
|
+
*
|
|
36
|
+
* @private
|
|
37
|
+
*/
|
|
38
|
+
static handlers = new Map();
|
|
39
|
+
/**
|
|
40
|
+
* Default maximum audio duration for STT transcription (in seconds)
|
|
41
|
+
*
|
|
42
|
+
* Providers can override this value by specifying the `maxAudioDuration` property
|
|
43
|
+
* in their respective `STTHandler` implementation. If not specified, this default
|
|
44
|
+
* value will be used (5 minutes).
|
|
45
|
+
*
|
|
46
|
+
* @private
|
|
47
|
+
*/
|
|
48
|
+
static DEFAULT_MAX_AUDIO_DURATION = 300;
|
|
49
|
+
/**
|
|
50
|
+
* Register an STT handler for a specific provider
|
|
51
|
+
*
|
|
52
|
+
* Allows providers to register their STT implementation at runtime.
|
|
53
|
+
*
|
|
54
|
+
* @param providerName - Provider identifier (e.g., 'whisper', 'deepgram')
|
|
55
|
+
* @param handler - STT handler implementation
|
|
56
|
+
*
|
|
57
|
+
* @example
|
|
58
|
+
* ```typescript
|
|
59
|
+
* const whisperHandler: STTHandler = {
|
|
60
|
+
* transcribe: async (audio, options) => { ... },
|
|
61
|
+
* getSupportedFormats: () => ["mp3", "wav"],
|
|
62
|
+
* isConfigured: () => true
|
|
63
|
+
* };
|
|
64
|
+
*
|
|
65
|
+
* STTProcessor.registerHandler('whisper', whisperHandler);
|
|
66
|
+
* ```
|
|
67
|
+
*/
|
|
68
|
+
static registerHandler(providerName, handler) {
|
|
69
|
+
if (!providerName) {
|
|
70
|
+
throw new Error("Provider name is required");
|
|
71
|
+
}
|
|
72
|
+
if (!handler) {
|
|
73
|
+
throw new Error("Handler is required");
|
|
74
|
+
}
|
|
75
|
+
const normalizedName = providerName.toLowerCase();
|
|
76
|
+
if (this.handlers.has(normalizedName)) {
|
|
77
|
+
logger.warn(`[STTProcessor] Overwriting existing handler for provider: ${normalizedName}`);
|
|
78
|
+
}
|
|
79
|
+
this.handlers.set(normalizedName, handler);
|
|
80
|
+
logger.debug(`[STTProcessor] Registered STT handler for provider: ${normalizedName}`);
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Get a registered STT handler by provider name
|
|
84
|
+
*
|
|
85
|
+
* @private
|
|
86
|
+
* @param providerName - Provider identifier
|
|
87
|
+
* @returns Handler instance or undefined if not registered
|
|
88
|
+
*/
|
|
89
|
+
static getHandler(providerName) {
|
|
90
|
+
const normalizedName = providerName.toLowerCase();
|
|
91
|
+
return this.handlers.get(normalizedName);
|
|
92
|
+
}
|
|
93
|
+
/**
|
|
94
|
+
* Check if a provider is supported (has a registered STT handler)
|
|
95
|
+
*
|
|
96
|
+
* @param providerName - Provider identifier
|
|
97
|
+
* @returns True if handler is registered
|
|
98
|
+
*
|
|
99
|
+
* @example
|
|
100
|
+
* ```typescript
|
|
101
|
+
* if (STTProcessor.supports('whisper')) {
|
|
102
|
+
* console.log('Whisper STT is supported');
|
|
103
|
+
* }
|
|
104
|
+
* ```
|
|
105
|
+
*/
|
|
106
|
+
static supports(providerName) {
|
|
107
|
+
if (!providerName) {
|
|
108
|
+
logger.error("[STTProcessor] Provider name is required for supports check");
|
|
109
|
+
return false;
|
|
110
|
+
}
|
|
111
|
+
const normalizedName = providerName.toLowerCase();
|
|
112
|
+
const isSupported = this.handlers.has(normalizedName);
|
|
113
|
+
if (!isSupported) {
|
|
114
|
+
logger.debug(`[STTProcessor] Provider ${providerName} is not supported`);
|
|
115
|
+
}
|
|
116
|
+
return isSupported;
|
|
117
|
+
}
|
|
118
|
+
/**
|
|
119
|
+
* Transcribe audio to text using a registered STT provider
|
|
120
|
+
*
|
|
121
|
+
* Orchestrates the speech-to-text transcription process:
|
|
122
|
+
* 1. Validates audio input (non-empty)
|
|
123
|
+
* 2. Looks up the provider handler
|
|
124
|
+
* 3. Verifies provider configuration
|
|
125
|
+
* 4. Delegates transcription to the provider
|
|
126
|
+
* 5. Enriches result with provider metadata
|
|
127
|
+
*
|
|
128
|
+
* @param audio - Audio data as Buffer or ArrayBuffer
|
|
129
|
+
* @param provider - Provider identifier
|
|
130
|
+
* @param options - STT configuration options
|
|
131
|
+
* @returns Transcription result with text and metadata
|
|
132
|
+
* @throws STTError if validation fails or provider not supported/configured
|
|
133
|
+
*
|
|
134
|
+
* @example
|
|
135
|
+
* ```typescript
|
|
136
|
+
* const result = await STTProcessor.transcribe(audioBuffer, "whisper", {
|
|
137
|
+
* language: "en-US",
|
|
138
|
+
* punctuation: true,
|
|
139
|
+
* });
|
|
140
|
+
*
|
|
141
|
+
* console.log(`Transcription: ${result.text}`);
|
|
142
|
+
* console.log(`Confidence: ${result.confidence}`);
|
|
143
|
+
* ```
|
|
144
|
+
*/
|
|
145
|
+
static async transcribe(audio, provider, options) {
|
|
146
|
+
// Create span early so preflight failures are captured
|
|
147
|
+
const span = SpanSerializer.createSpan(SpanType.STT, "stt.transcribe", {
|
|
148
|
+
"stt.operation": "transcribe",
|
|
149
|
+
"stt.provider": provider,
|
|
150
|
+
"stt.language": options.language,
|
|
151
|
+
"stt.format": options.format,
|
|
152
|
+
});
|
|
153
|
+
try {
|
|
154
|
+
// 1. Audio validation: reject empty + oversized audio
|
|
155
|
+
const byteLength = audio instanceof ArrayBuffer ? audio.byteLength : audio.length;
|
|
156
|
+
if (!byteLength || byteLength === 0) {
|
|
157
|
+
logger.error("[STTProcessor] Audio data is required for transcription");
|
|
158
|
+
throw new STTError({
|
|
159
|
+
code: STT_ERROR_CODES.AUDIO_EMPTY,
|
|
160
|
+
message: "Audio data is required for STT transcription",
|
|
161
|
+
severity: ErrorSeverity.LOW,
|
|
162
|
+
retriable: false,
|
|
163
|
+
context: { provider },
|
|
164
|
+
});
|
|
165
|
+
}
|
|
166
|
+
// NEW13: enforce a size upper bound so a multi-GB Buffer can't OOM the
|
|
167
|
+
// process. Default 25 MB matches Whisper's documented limit; callers
|
|
168
|
+
// can override via `options.maxAudioBytes`. Permanent errors at the
|
|
169
|
+
// provider level (e.g. Whisper rejecting >25MB) become this clean
|
|
170
|
+
// STTError instead of a memory crash or vendor 413.
|
|
171
|
+
const maxAudioBytes = options.maxAudioBytes ?? 25_000_000;
|
|
172
|
+
if (byteLength > maxAudioBytes) {
|
|
173
|
+
logger.error(`[STTProcessor] Audio buffer ${byteLength} bytes exceeds limit ${maxAudioBytes}`);
|
|
174
|
+
throw new STTError({
|
|
175
|
+
code: STT_ERROR_CODES.AUDIO_TOO_LONG,
|
|
176
|
+
message: `Audio buffer ${byteLength} bytes exceeds maximum ${maxAudioBytes} bytes for STT transcription. Increase maxAudioBytes in options or chunk the audio.`,
|
|
177
|
+
severity: ErrorSeverity.HIGH,
|
|
178
|
+
retriable: false,
|
|
179
|
+
context: { provider, byteLength, maxAudioBytes },
|
|
180
|
+
});
|
|
181
|
+
}
|
|
182
|
+
// 2. Handler lookup and error if provider not supported
|
|
183
|
+
const handler = this.getHandler(provider);
|
|
184
|
+
if (!handler) {
|
|
185
|
+
logger.error(`[STTProcessor] Provider "${provider}" is not registered`);
|
|
186
|
+
throw new STTError({
|
|
187
|
+
code: STT_ERROR_CODES.PROVIDER_NOT_SUPPORTED,
|
|
188
|
+
message: `STT provider "${provider}" is not supported. Use STTProcessor.registerHandler() to register it.`,
|
|
189
|
+
severity: ErrorSeverity.HIGH,
|
|
190
|
+
retriable: false,
|
|
191
|
+
context: {
|
|
192
|
+
provider,
|
|
193
|
+
availableProviders: Array.from(this.handlers.keys()),
|
|
194
|
+
},
|
|
195
|
+
});
|
|
196
|
+
}
|
|
197
|
+
// 3. Format compatibility check — fail fast when the caller passes
|
|
198
|
+
// an audio format the provider explicitly does not decode (e.g. MP3 to
|
|
199
|
+
// azure-stt). Without this, providers like Azure return a Success
|
|
200
|
+
// response with empty text, which then cascades into a confusing
|
|
201
|
+
// "prompt must be at least 1 character long" failure on the downstream
|
|
202
|
+
// LLM call. We only validate when both `options.format` and
|
|
203
|
+
// `handler.getSupportedFormats()` are present so we never block providers
|
|
204
|
+
// that prefer to do their own detection.
|
|
205
|
+
if (options.format && typeof handler.getSupportedFormats === "function") {
|
|
206
|
+
const supported = handler.getSupportedFormats();
|
|
207
|
+
if (Array.isArray(supported) &&
|
|
208
|
+
supported.length > 0 &&
|
|
209
|
+
!supported.includes(options.format)) {
|
|
210
|
+
logger.error(`[STTProcessor] Provider "${provider}" does not support audio format "${options.format}"`);
|
|
211
|
+
throw new STTError({
|
|
212
|
+
code: STT_ERROR_CODES.INVALID_AUDIO_FORMAT,
|
|
213
|
+
message: `STT provider "${provider}" does not support audio format "${options.format}". Supported formats: ${supported.join(", ")}.`,
|
|
214
|
+
severity: ErrorSeverity.HIGH,
|
|
215
|
+
retriable: false,
|
|
216
|
+
context: {
|
|
217
|
+
provider,
|
|
218
|
+
requestedFormat: options.format,
|
|
219
|
+
supportedFormats: supported,
|
|
220
|
+
},
|
|
221
|
+
});
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
// 4. Configuration check
|
|
225
|
+
if (!handler.isConfigured()) {
|
|
226
|
+
logger.warn(`[STTProcessor] Provider "${provider}" is not properly configured`);
|
|
227
|
+
throw new STTError({
|
|
228
|
+
code: STT_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
|
|
229
|
+
message: `STT provider "${provider}" is not configured. Please set the required API keys.`,
|
|
230
|
+
category: ErrorCategory.CONFIGURATION,
|
|
231
|
+
severity: ErrorSeverity.HIGH,
|
|
232
|
+
retriable: false,
|
|
233
|
+
context: { provider },
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
logger.debug(`[STTProcessor] Starting transcription with provider: ${provider}`);
|
|
237
|
+
// 5. Call handler.transcribe() - providers handle their own timeouts
|
|
238
|
+
const result = await handler.transcribe(audio, options);
|
|
239
|
+
// 6. Post-processing: enrich result with provider metadata
|
|
240
|
+
const enrichedResult = {
|
|
241
|
+
...result,
|
|
242
|
+
metadata: {
|
|
243
|
+
...result.metadata,
|
|
244
|
+
provider,
|
|
245
|
+
latency: result.metadata?.latency ?? 0,
|
|
246
|
+
},
|
|
247
|
+
};
|
|
248
|
+
// Don't log transcript content at INFO — voice transcriptions can carry
|
|
249
|
+
// PII / health / financial data, and INFO is typically persisted in
|
|
250
|
+
// production log aggregation (CloudWatch, Datadog, etc.). GDPR / CCPA
|
|
251
|
+
// concern. Length and provider are safe to record.
|
|
252
|
+
logger.debug(`[STTProcessor] Transcription completed for provider "${provider}" (${result.text.length} chars)`);
|
|
253
|
+
// 7. Record successful span
|
|
254
|
+
const endedSpan = SpanSerializer.endSpan(span, SpanStatus.OK);
|
|
255
|
+
getMetricsAggregator().recordSpan(endedSpan);
|
|
256
|
+
// 8. Return STTResult with text, confidence, metadata
|
|
257
|
+
return enrichedResult;
|
|
258
|
+
}
|
|
259
|
+
catch (err) {
|
|
260
|
+
// Record error span
|
|
261
|
+
const endedSpan = SpanSerializer.endSpan(span, SpanStatus.ERROR, err instanceof Error ? err.message : String(err));
|
|
262
|
+
getMetricsAggregator().recordSpan(endedSpan);
|
|
263
|
+
// Re-throw STTError as-is
|
|
264
|
+
if (err instanceof STTError) {
|
|
265
|
+
throw err;
|
|
266
|
+
}
|
|
267
|
+
// Wrap other errors in STTError
|
|
268
|
+
const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
|
|
269
|
+
logger.error(`[STTProcessor] Transcription failed for provider "${provider}": ${errorMessage}`);
|
|
270
|
+
throw new STTError({
|
|
271
|
+
code: STT_ERROR_CODES.TRANSCRIPTION_FAILED,
|
|
272
|
+
message: `STT transcription failed for provider "${provider}": ${errorMessage}`,
|
|
273
|
+
category: ErrorCategory.EXECUTION,
|
|
274
|
+
severity: ErrorSeverity.HIGH,
|
|
275
|
+
retriable: true,
|
|
276
|
+
context: {
|
|
277
|
+
provider,
|
|
278
|
+
audioByteLength: audio instanceof ArrayBuffer ? audio.byteLength : audio.length,
|
|
279
|
+
// Sanitize: strip free-text user-supplied fields (e.g. WhisperSTTOptions.prompt)
|
|
280
|
+
// from the error context so error-monitoring pipelines (Sentry, Datadog APM)
|
|
281
|
+
// don't ingest user audio prompt text.
|
|
282
|
+
options: {
|
|
283
|
+
format: options.format,
|
|
284
|
+
language: options.language,
|
|
285
|
+
wordTimestamps: options.wordTimestamps,
|
|
286
|
+
maxAudioBytes: options.maxAudioBytes,
|
|
287
|
+
speakerDiarization: options.speakerDiarization,
|
|
288
|
+
},
|
|
289
|
+
},
|
|
290
|
+
originalError: err instanceof Error ? err : undefined,
|
|
291
|
+
});
|
|
292
|
+
}
|
|
293
|
+
}
|
|
294
|
+
}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime Voice API Infrastructure
|
|
3
|
+
*
|
|
4
|
+
* Base handler and processor for realtime voice communication.
|
|
5
|
+
* Supports bidirectional audio streaming with providers like OpenAI and Gemini.
|
|
6
|
+
*
|
|
7
|
+
* @module voice/RealtimeVoiceAPI
|
|
8
|
+
*/
|
|
9
|
+
import type { TTSAudioFormat, RealtimeAudioChunk, RealtimeConfig, RealtimeEventHandlers, RealtimeHandler, RealtimeSession, RealtimeSessionState } from "../types/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Realtime Processor class for orchestrating realtime voice operations
|
|
12
|
+
*
|
|
13
|
+
* Provides a unified interface for realtime voice across multiple providers.
|
|
14
|
+
*
|
|
15
|
+
* @example
|
|
16
|
+
* ```typescript
|
|
17
|
+
* // Register a handler (typically done in providerRegistry.ts on startup)
|
|
18
|
+
* RealtimeProcessor.registerHandler('openai-realtime', openaiHandler);
|
|
19
|
+
*
|
|
20
|
+
* // Connect to a session — the first arg is the registered handler key,
|
|
21
|
+
* // and `config.provider` must match the same key.
|
|
22
|
+
* const session = await RealtimeProcessor.connect('openai-realtime', {
|
|
23
|
+
* provider: 'openai-realtime',
|
|
24
|
+
* voice: 'alloy',
|
|
25
|
+
* systemPrompt: 'You are a helpful assistant.'
|
|
26
|
+
* });
|
|
27
|
+
*
|
|
28
|
+
* // Send audio
|
|
29
|
+
* await RealtimeProcessor.sendAudio('openai-realtime', audioBuffer);
|
|
30
|
+
*
|
|
31
|
+
* // Disconnect
|
|
32
|
+
* await RealtimeProcessor.disconnect('openai-realtime');
|
|
33
|
+
* ```
|
|
34
|
+
*/
|
|
35
|
+
export declare class RealtimeProcessor {
    /**
     * Handler registry mapping provider names to Realtime handlers
     */
    private static readonly handlers;
    /**
     * Active sessions by provider
     */
    private static readonly sessions;
    /**
     * Register a Realtime handler for a specific provider
     *
     * @param providerName - Registered handler key (e.g., 'openai-realtime').
     *   This is the same key later passed as the `provider` argument of
     *   `connect`, `sendAudio`, `disconnect`, etc.
     * @param handler - Realtime handler implementation
     */
    static registerHandler(providerName: string, handler: RealtimeHandler): void;
    /**
     * Get a registered Realtime handler by provider name
     */
    private static getHandler;
    /**
     * Check if a provider is supported
     *
     * @param providerName - Registered handler key
     * @returns Whether the provider is supported
     */
    static supports(providerName: string): boolean;
    /**
     * Get list of all registered providers
     *
     * @returns Registered provider names
     */
    static getProviders(): string[];
    /**
     * Connect to a realtime session
     *
     * @param provider - Registered handler key
     * @param config - Session configuration
     * @param handlers - Optional event handlers
     * @returns Session information
     */
    static connect(provider: string, config: RealtimeConfig, handlers?: RealtimeEventHandlers): Promise<RealtimeSession>;
    /**
     * Disconnect from a realtime session
     *
     * @param provider - Registered handler key
     */
    static disconnect(provider: string): Promise<void>;
    /**
     * Send audio to a realtime session
     *
     * @param provider - Registered handler key
     * @param audio - Audio data, as a raw Buffer or a RealtimeAudioChunk
     */
    static sendAudio(provider: string, audio: Buffer | RealtimeAudioChunk): Promise<void>;
    /**
     * Send text to a realtime session
     *
     * @param provider - Registered handler key
     * @param text - Text to send
     */
    static sendText(provider: string, text: string): Promise<void>;
    /**
     * Trigger a response from the model (manual turn detection)
     *
     * @param provider - Registered handler key
     */
    static triggerResponse(provider: string): Promise<void>;
    /**
     * Cancel the current response
     *
     * @param provider - Registered handler key
     */
    static cancelResponse(provider: string): Promise<void>;
    /**
     * Get current session for a provider
     *
     * @param provider - Registered handler key
     * @returns Session or null
     */
    static getSession(provider: string): RealtimeSession | null;
    /**
     * Check if a provider has an active session
     *
     * @param provider - Registered handler key
     * @returns Whether the provider has an active session
     */
    static isConnected(provider: string): boolean;
    /**
     * Get supported formats for a provider
     *
     * @param provider - Registered handler key
     * @returns Audio formats supported by the provider
     */
    static getSupportedFormats(provider: string): TTSAudioFormat[];
    /**
     * Clear all handlers and sessions (for testing)
     */
    static clearHandlers(): void;
}
|
|
128
|
+
/**
|
|
129
|
+
* Base Realtime Handler with common functionality
|
|
130
|
+
*
|
|
131
|
+
* Providers can extend this class for common behavior.
|
|
132
|
+
*/
|
|
133
|
+
export declare abstract class BaseRealtimeHandler implements RealtimeHandler {
    /** Provider key for this handler, typed as `RealtimeConfig["provider"]`. */
    abstract readonly name: RealtimeConfig["provider"];
    /** Current session, or null. */
    protected session: RealtimeSession | null;
    /** Currently attached event handlers, or null. */
    protected eventHandlers: RealtimeEventHandlers | null;
    /** Current session state. */
    protected state: RealtimeSessionState;
    /**
     * Open a session with the given configuration.
     *
     * @param config - Session configuration
     * @returns The resulting session
     */
    abstract connect(config: RealtimeConfig): Promise<RealtimeSession>;
    /** Close the session. */
    abstract disconnect(): Promise<void>;
    /**
     * Send audio to the session.
     *
     * @param audio - Audio data, as a raw Buffer or a RealtimeAudioChunk
     */
    abstract sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
    /** Report whether the provider is configured. */
    abstract isConfigured(): boolean;
    /** List the audio formats the provider supports. */
    abstract getSupportedFormats(): TTSAudioFormat[];
    /** Report whether the handler is connected. */
    isConnected(): boolean;
    /** Get the session, or null. */
    getSession(): RealtimeSession | null;
    /**
     * Attach event handlers.
     *
     * NOTE(review): presumably stored in `eventHandlers`; the implementation
     * is not visible in this declaration file — confirm.
     *
     * @param handlers - Event handlers to attach
     */
    on(handlers: RealtimeEventHandlers): void;
    /**
     * Detach event handlers.
     *
     * NOTE(review): presumably clears `eventHandlers` — confirm.
     */
    off(): void;
    /**
     * Emit state change event
     *
     * @param newState - State being reported
     */
    protected emitStateChange(newState: RealtimeSessionState): void;
    /**
     * Emit audio event
     *
     * @param chunk - Audio chunk to deliver
     */
    protected emitAudio(chunk: RealtimeAudioChunk): void;
    /**
     * Emit transcript event
     *
     * @param text - Transcript text
     * @param isFinal - Whether this transcript is final
     */
    protected emitTranscript(text: string, isFinal: boolean): void;
    /**
     * Emit text event
     *
     * @param text - Text content
     * @param isFinal - Whether this text is final
     */
    protected emitText(text: string, isFinal: boolean): void;
    /**
     * Emit function call event
     *
     * @param name - Name of the function being called
     * @param args - Arguments for the call
     * @returns A promise for the call's outcome (typed `unknown`)
     */
    protected emitFunctionCall(name: string, args: Record<string, unknown>): Promise<unknown>;
    /**
     * Emit error event
     *
     * @param error - Error to report
     */
    protected emitError(error: Error): void;
    /**
     * Emit turn start event
     */
    protected emitTurnStart(): void;
    /**
     * Emit turn end event
     */
    protected emitTurnEnd(): void;
    /**
     * Create a session object
     *
     * @param id - Session identifier
     * @param config - Session configuration
     * @returns The created session object
     */
    protected createSession(id: string, config: RealtimeConfig): RealtimeSession;
}
|