@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Google Cloud Speech-to-Text Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of STT using Google Cloud Speech-to-Text API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/GoogleSTT
|
|
7
|
+
*/
|
|
8
|
+
import { logger } from "../../utils/logger.js";
|
|
9
|
+
import { STTError } from "../errors.js";
|
|
10
|
+
/**
 * Google Cloud Speech-to-Text Handler
 *
 * Supports transcription with speaker diarization, word timestamps, and punctuation.
 * Uses the synchronous `speech:recognize` REST endpoint with base64-encoded audio.
 * Authenticates with an API key (query parameter) or, when only a service-account
 * credentials file is configured, a Bearer token from google-auth-library.
 *
 * @see https://cloud.google.com/speech-to-text/docs
 */
export class GoogleSTT {
    // Resolved API key, or null when not configured.
    apiKey;
    // Path to a service-account JSON file, or null when not configured.
    credentialsPath;
    baseUrl = "https://speech.googleapis.com/v1";
    /**
     * Maximum audio duration in seconds for the synchronous recognize endpoint.
     * For longer audio, use the async longrunningrecognize endpoint (not yet implemented).
     */
    maxAudioDuration = 60;
    /**
     * True streaming requires gRPC (not yet implemented).
     * transcribeStream() uses a chunk-and-batch workaround.
     */
    supportsStreaming = false;
    /**
     * @param {string} [apiKey] Explicit API key; falls back to GOOGLE_API_KEY,
     *   then GOOGLE_AI_API_KEY, then GEMINI_API_KEY.
     * @param {string} [credentialsPath] Service-account JSON path; falls back to
     *   GOOGLE_APPLICATION_CREDENTIALS.
     */
    constructor(apiKey, credentialsPath) {
        // Accept GOOGLE_AI_API_KEY / GEMINI_API_KEY as aliases since `.env.example`
        // documents those as the canonical Google credentials and forcing users to
        // also set GOOGLE_API_KEY just for STT was a footgun (Copilot review).
        const resolvedKey = (apiKey ??
            process.env.GOOGLE_API_KEY ??
            process.env.GOOGLE_AI_API_KEY ??
            process.env.GEMINI_API_KEY ??
            "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
        const resolvedCreds = (credentialsPath ??
            process.env.GOOGLE_APPLICATION_CREDENTIALS ??
            "").trim();
        this.credentialsPath = resolvedCreds.length > 0 ? resolvedCreds : null;
    }
    /** True when either an API key or a credentials path is available. */
    isConfigured() {
        return this.apiKey !== null || this.credentialsPath !== null;
    }
    /** Audio container formats this handler accepts. */
    getSupportedFormats() {
        return ["mp3", "wav", "ogg", "opus"];
    }
    /**
     * Common languages supported by Google STT (not exhaustive — the API
     * supports many more locales).
     *
     * @returns Array of { code, name, supportsDiarization, supportsPunctuation }.
     */
    async getSupportedLanguages() {
        // Every locale listed here supports diarization and automatic punctuation,
        // so build the entries from a compact [code, name] table.
        const locales = [
            ["en-US", "English (US)"],
            ["en-GB", "English (UK)"],
            ["es-ES", "Spanish (Spain)"],
            ["es-US", "Spanish (US)"],
            ["fr-FR", "French"],
            ["de-DE", "German"],
            ["it-IT", "Italian"],
            ["pt-BR", "Portuguese (Brazil)"],
            ["ja-JP", "Japanese"],
            ["ko-KR", "Korean"],
            ["zh-CN", "Chinese (Simplified)"],
            ["zh-TW", "Chinese (Traditional)"],
            ["ar-SA", "Arabic"],
            ["hi-IN", "Hindi"],
            ["ru-RU", "Russian"],
        ];
        return locales.map(([code, name]) => ({
            code,
            name,
            supportsDiarization: true,
            supportsPunctuation: true,
        }));
    }
    /**
     * Transcribe a complete audio buffer via the synchronous recognize endpoint.
     *
     * @param audio   Audio payload (Buffer or byte array); must be non-empty.
     * @param options language, format, sampleRate, punctuation, wordTimestamps,
     *                profanityFilter, speakerDiarization/speakerCount, plus
     *                Google-specific model/useEnhanced/maxAlternatives.
     * @returns Result with text, confidence, language, and optional words/
     *          speakers/segments plus latency metadata.
     * @throws STTError when unconfigured, audio is empty, the request times
     *         out (30s), or the API returns a non-OK status.
     */
    async transcribe(audio, options = {}) {
        if (!this.isConfigured()) {
            throw STTError.providerNotConfigured("google-stt");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
        if (audioBuffer.length === 0) {
            throw STTError.audioEmpty("google-stt");
        }
        // Alias for Google-specific option fields (model, useEnhanced, maxAlternatives).
        const googleOptions = options;
        const startTime = Date.now();
        try {
            // Build recognition config
            const detectedFormat = options.format ?? "wav";
            const config = {
                encoding: this.getEncoding(detectedFormat),
                // Omit sampleRateHertz for WAV/FLAC — the API reads it from the header.
                // Hardcoding a wrong value causes "sample_rate_hertz must match WAV header" errors.
                ...(detectedFormat !== "wav" && detectedFormat !== "flac"
                    ? { sampleRateHertz: options.sampleRate ?? 16000 }
                    : options.sampleRate
                        ? { sampleRateHertz: options.sampleRate }
                        : {}),
                languageCode: options.language ?? "en-US",
                enableAutomaticPunctuation: options.punctuation ?? true,
                enableWordTimeOffsets: options.wordTimestamps ?? false,
                enableWordConfidence: true,
                profanityFilter: options.profanityFilter ?? false,
            };
            // Add model if specified
            if (googleOptions.model) {
                config.model = googleOptions.model;
            }
            // Add enhanced model option
            if (googleOptions.useEnhanced) {
                config.useEnhanced = true;
            }
            // Add diarization if requested
            if (options.speakerDiarization) {
                config.enableSpeakerDiarization = true;
                if (options.speakerCount) {
                    config.diarizationSpeakerCount = options.speakerCount;
                }
            }
            // Add max alternatives
            if (googleOptions.maxAlternatives) {
                config.maxAlternatives = googleOptions.maxAlternatives;
            }
            // Build request
            const requestBody = {
                config,
                audio: {
                    content: audioBuffer.toString("base64"),
                },
            };
            // API-key auth goes in the query string; encode defensively so an
            // unusual key cannot corrupt the URL.
            const url = this.apiKey
                ? `${this.baseUrl}/speech:recognize?key=${encodeURIComponent(this.apiKey)}`
                : `${this.baseUrl}/speech:recognize`;
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(url, {
                    method: "POST",
                    headers: {
                        "Content-Type": "application/json",
                        // Bearer auth only when no API key is present.
                        ...(this.credentialsPath && !this.apiKey
                            ? { Authorization: `Bearer ${await this.getAccessToken()}` }
                            : {}),
                    },
                    body: JSON.stringify(requestBody),
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw STTError.transcriptionFailed("Google STT request timed out after 30 seconds", "google-stt", fetchErr);
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                // Error body may not be JSON; fall back to a bare HTTP status.
                const errorData = await response
                    .json()
                    .catch(() => Object.create(null));
                const errorMessage = errorData.error?.message ||
                    `HTTP ${response.status}`;
                throw STTError.transcriptionFailed(errorMessage, "google-stt");
            }
            const data = (await response.json());
            const latency = Date.now() - startTime;
            // Handle empty results — silence or unrecognizable audio is not an error.
            if (!data.results || data.results.length === 0) {
                return {
                    text: "",
                    confidence: 0,
                    language: options.language,
                    metadata: {
                        latency,
                        provider: "google-stt",
                    },
                };
            }
            // Build result from the top alternative of every result segment.
            const result = {
                text: data.results
                    .map((r) => r.alternatives[0]?.transcript ?? "")
                    .join(" ")
                    .trim(),
                confidence: this.calculateAverageConfidence(data.results),
                language: data.results[0]?.languageCode ?? options.language,
                metadata: {
                    latency,
                    provider: "google-stt",
                    billedTime: data.totalBilledTime,
                },
            };
            // Collect word timings (present when enableWordTimeOffsets was set)
            // and the set of diarized speakers.
            const words = [];
            const speakers = new Set();
            for (const resultItem of data.results) {
                const alternative = resultItem.alternatives[0];
                if (alternative?.words) {
                    for (const wordInfo of alternative.words) {
                        const word = {
                            word: wordInfo.word,
                            startTime: this.parseDuration(wordInfo.startTime),
                            endTime: this.parseDuration(wordInfo.endTime),
                            confidence: wordInfo.confidence,
                        };
                        if (wordInfo.speakerTag !== undefined) {
                            word.speaker = `Speaker ${wordInfo.speakerTag}`;
                            speakers.add(word.speaker);
                        }
                        words.push(word);
                    }
                }
            }
            if (words.length > 0) {
                result.words = words;
            }
            if (speakers.size > 0) {
                result.speakers = Array.from(speakers);
            }
            // Add per-result segments (all final for the synchronous endpoint).
            result.segments = data.results.map((resultItem, index) => {
                const alt = resultItem.alternatives[0];
                return {
                    index,
                    text: alt?.transcript ?? "",
                    isFinal: true,
                    confidence: alt?.confidence ?? 0,
                    language: resultItem.languageCode,
                };
            });
            logger.info(`[GoogleSTTHandler] Transcribed audio in ${latency}ms`);
            return result;
        }
        catch (err) {
            if (err instanceof STTError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[GoogleSTTHandler] Transcription failed: ${errorMessage}`);
            throw STTError.transcriptionFailed(errorMessage, "google-stt", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Streaming transcription (placeholder - requires WebSocket/gRPC).
     *
     * Buffers the incoming stream and batch-transcribes roughly every 5 seconds
     * of audio (assuming 16 kHz, 16-bit mono), yielding one segment per batch.
     *
     * @param audioStream Async iterable of audio Buffers.
     * @param options     Same options as transcribe().
     * @yields { index, text, isFinal, confidence } per processed batch.
     */
    async *transcribeStream(audioStream, options) {
        // Google streaming STT requires gRPC or WebSocket connection
        // For now, buffer and transcribe in chunks
        const chunks = [];
        let chunkIndex = 0;
        for await (const chunk of audioStream) {
            chunks.push(chunk);
            // Process every ~5 seconds of audio (assuming 16kHz, 16-bit)
            const bytesPerSecond = 16000 * 2; // 16kHz * 2 bytes
            const totalBytes = chunks.reduce((sum, c) => sum + c.length, 0);
            if (totalBytes >= bytesPerSecond * 5) {
                const audio = Buffer.concat(chunks);
                chunks.length = 0;
                try {
                    const result = await this.transcribe(audio, options);
                    yield {
                        index: chunkIndex++,
                        text: result.text,
                        isFinal: false,
                        confidence: result.confidence,
                    };
                }
                catch (err) {
                    // M5: distinguish permanent (auth, schema, 4xx) from transient
                    // (5xx, 429, network) errors. Permanent errors would retry
                    // indefinitely and rack up failed API calls; rethrow to
                    // terminate the stream. Transient errors get logged and
                    // skipped so a multi-minute audio stream can recover from a
                    // transient hiccup.
                    const msg = err instanceof Error ? err.message : String(err);
                    const isPermanent = /\b(401|403|404|UNAUTHENTICATED|PERMISSION_DENIED|INVALID_ARGUMENT|UNAUTHORIZED|FORBIDDEN|invalid.*credential|invalid.*key)\b/i.test(msg);
                    if (isPermanent) {
                        logger.error(`[GoogleSTTHandler] Permanent chunk error — terminating stream: ${msg}`);
                        throw err;
                    }
                    logger.warn(`[GoogleSTTHandler] Transient chunk failure (skipping): ${msg}`);
                }
            }
        }
        // Process remaining audio
        if (chunks.length > 0) {
            const audio = Buffer.concat(chunks);
            try {
                const result = await this.transcribe(audio, options);
                yield {
                    index: chunkIndex,
                    text: result.text,
                    isFinal: true,
                    confidence: result.confidence,
                };
            }
            catch (err) {
                // Don't swallow the final chunk's terminal errors — auth/config/4xx
                // failures here would otherwise look like a successful empty
                // transcription, hiding the root cause from callers (CodeRabbit
                // review). Mirror the permanent-vs-transient split used in the
                // chunk loop above (Azure/Google share this taxonomy).
                const msg = err instanceof Error ? err.message : String(err);
                const isPermanent = /\b(401|403|404|Forbidden|Unauthorized|Invalid.*credential|Invalid.*key|Permission|PERMISSION_DENIED|UNAUTHENTICATED|INVALID_ARGUMENT)\b/i.test(msg);
                if (isPermanent) {
                    logger.error(`[GoogleSTTHandler] Permanent final-chunk error — surfacing: ${msg}`);
                    throw err;
                }
                logger.warn(`[GoogleSTTHandler] Final chunk transcription failed (transient): ${msg}`);
            }
        }
    }
    /**
     * Get the RecognitionConfig encoding string for an audio format.
     * FLAC maps to "FLAC": transcribe() already special-cases flac (sample rate
     * is read from the header), so mapping it to the LINEAR16 fallback would
     * mis-declare FLAC payloads to the API.
     */
    getEncoding(format) {
        const encodings = {
            mp3: "MP3",
            wav: "LINEAR16",
            ogg: "OGG_OPUS",
            opus: "OGG_OPUS",
            flac: "FLAC",
        };
        return encodings[format] ?? "LINEAR16";
    }
    /**
     * Parse a protobuf Duration string (e.g., "1.5s") to seconds.
     * Returns 0 for missing or unparseable input.
     */
    parseDuration(duration) {
        if (!duration) {
            return 0;
        }
        const match = duration.match(/^([\d.]+)s$/);
        return match ? parseFloat(match[1]) : 0;
    }
    /**
     * Average the top-alternative confidence across results, ignoring results
     * that carry no numeric confidence. Returns 0 when none do.
     */
    calculateAverageConfidence(results) {
        const confidences = results
            .map((r) => r.alternatives[0]?.confidence)
            .filter((c) => typeof c === "number");
        if (confidences.length === 0) {
            return 0;
        }
        return confidences.reduce((sum, c) => sum + c, 0) / confidences.length;
    }
    /**
     * Get access token from service account credentials.
     *
     * M3: previously caught all errors and returned `""`, which then caused
     * a silent 401 from the Google API and a confusing downstream HTTP error
     * with no trace of the original auth failure. Now rethrows as STTError so
     * the caller sees the auth root cause.
     *
     * @throws STTError when the token is empty or acquisition fails.
     */
    async getAccessToken() {
        try {
            const { GoogleAuth } = await import("google-auth-library");
            const auth = new GoogleAuth({
                ...(this.credentialsPath ? { keyFilename: this.credentialsPath } : {}),
                scopes: ["https://www.googleapis.com/auth/cloud-platform"],
            });
            const client = await auth.getClient();
            const tokenResponse = await client.getAccessToken();
            const token = tokenResponse.token;
            if (!token) {
                throw STTError.transcriptionFailed("Google access token returned empty — check GOOGLE_APPLICATION_CREDENTIALS path and service account permissions", "google-stt");
            }
            return token;
        }
        catch (err) {
            logger.error(`[GoogleSTTHandler] Failed to acquire access token: ${err instanceof Error ? err.message : String(err)}`);
            // Use instanceof — refactor-resilient and matches the pattern in
            // transcribe(). The earlier `err.name === "STTError"` check would
            // double-wrap if the base class ever overwrote `name`.
            if (err instanceof STTError) {
                throw err;
            }
            throw STTError.transcriptionFailed(`Google access token acquisition failed: ${err instanceof Error ? err.message : String(err)}`, "google-stt");
        }
    }
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenAI Realtime Voice API Handler
|
|
3
|
+
*
|
|
4
|
+
* Implementation of bidirectional voice communication using OpenAI's Realtime API.
|
|
5
|
+
*
|
|
6
|
+
* @module voice/providers/OpenAIRealtime
|
|
7
|
+
*/
|
|
8
|
+
import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
|
|
9
|
+
import type { TTSAudioFormat, RealtimeAudioChunk, RealtimeConfig, RealtimeSession } from "../../types/index.js";
|
|
10
|
+
/**
 * OpenAI Realtime API Handler
 *
 * Implements bidirectional voice communication with OpenAI's Realtime API.
 * Declaration file only — behavior notes below are inferred from member names
 * and should be confirmed against the implementation.
 *
 * @see https://platform.openai.com/docs/api-reference/realtime
 */
export declare class OpenAIRealtime extends BaseRealtimeHandler {
    /** Stable provider identifier for logs, errors, and provider registries. */
    readonly name = "openai-realtime";
    /** API key captured at construction (fallback/env resolution lives in the implementation). */
    private readonly apiKey;
    /** WebSocket connection to the Realtime endpoint; lifecycle managed by connect()/disconnect(). */
    private ws;
    /** Running index for audio chunks — NOTE(review): direction (incoming vs outgoing) not visible here; confirm in implementation. */
    private audioChunkIndex;
    /** @param apiKey Optional explicit API key; omitted means the implementation resolves one itself. */
    constructor(apiKey?: string);
    /** True when credentials sufficient to open a Realtime session are available. */
    isConfigured(): boolean;
    /** Audio formats this handler supports for its audio frames. */
    getSupportedFormats(): TTSAudioFormat[];
    /**
     * Open a Realtime session with the given configuration.
     * Presumably resolves after the server's `session.created` event
     * (see waitForSessionCreated) — confirm in implementation.
     */
    connect(config: RealtimeConfig): Promise<RealtimeSession>;
    /** Close the connection and release session state. */
    disconnect(): Promise<void>;
    /** Send one chunk of caller audio (raw Buffer or wrapped chunk) to the model. */
    sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
    /** Send a text message into the conversation. */
    sendText(text: string): Promise<void>;
    /** Ask the model to start producing a response now. */
    triggerResponse(): Promise<void>;
    /** Cancel the model's in-progress response. */
    cancelResponse(): Promise<void>;
    /**
     * Send session update with configuration
     */
    private sendSessionUpdate;
    /**
     * Wait for session.created event
     */
    private waitForSessionCreated;
    /**
     * Handle incoming WebSocket messages
     */
    private handleMessage;
    /**
     * Handle function call from model
     */
    private handleFunctionCall;
}
|