@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -0,0 +1,454 @@
1
+ /**
2
+ * Google Cloud Speech-to-Text Handler
3
+ *
4
+ * Implementation of STT using Google Cloud Speech-to-Text API.
5
+ *
6
+ * @module voice/providers/GoogleSTT
7
+ */
8
+ import { logger } from "../../utils/logger.js";
9
+ import { STTError } from "../errors.js";
10
+ /**
11
+ * Google Cloud Speech-to-Text Handler
12
+ *
13
+ * Supports transcription with speaker diarization, word timestamps, and punctuation.
14
+ *
15
+ * @see https://cloud.google.com/speech-to-text/docs
16
+ */
17
+ export class GoogleSTT {
18
+ apiKey;
19
+ credentialsPath;
20
+ baseUrl = "https://speech.googleapis.com/v1";
21
+ /**
22
+ * Maximum audio duration in seconds for the synchronous recognize endpoint.
23
+ * For longer audio, use the async longrunningrecognize endpoint (not yet implemented).
24
+ */
25
+ maxAudioDuration = 60;
26
+ /**
27
+ * True streaming requires gRPC (not yet implemented).
28
+ * transcribeStream() uses a chunk-and-batch workaround.
29
+ */
30
+ supportsStreaming = false;
31
+ constructor(apiKey, credentialsPath) {
32
+ // Accept GOOGLE_AI_API_KEY / GEMINI_API_KEY as aliases since `.env.example`
33
+ // documents those as the canonical Google credentials and forcing users to
34
+ // also set GOOGLE_API_KEY just for STT was a footgun (Copilot review).
35
+ const resolvedKey = (apiKey ??
36
+ process.env.GOOGLE_API_KEY ??
37
+ process.env.GOOGLE_AI_API_KEY ??
38
+ process.env.GEMINI_API_KEY ??
39
+ "").trim();
40
+ this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
41
+ const resolvedCreds = (credentialsPath ??
42
+ process.env.GOOGLE_APPLICATION_CREDENTIALS ??
43
+ "").trim();
44
+ this.credentialsPath = resolvedCreds.length > 0 ? resolvedCreds : null;
45
+ }
46
+ isConfigured() {
47
+ return this.apiKey !== null || this.credentialsPath !== null;
48
+ }
49
+ getSupportedFormats() {
50
+ return ["mp3", "wav", "ogg", "opus"];
51
+ }
52
+ async getSupportedLanguages() {
53
+ // Return common languages supported by Google STT
54
+ return [
55
+ {
56
+ code: "en-US",
57
+ name: "English (US)",
58
+ supportsDiarization: true,
59
+ supportsPunctuation: true,
60
+ },
61
+ {
62
+ code: "en-GB",
63
+ name: "English (UK)",
64
+ supportsDiarization: true,
65
+ supportsPunctuation: true,
66
+ },
67
+ {
68
+ code: "es-ES",
69
+ name: "Spanish (Spain)",
70
+ supportsDiarization: true,
71
+ supportsPunctuation: true,
72
+ },
73
+ {
74
+ code: "es-US",
75
+ name: "Spanish (US)",
76
+ supportsDiarization: true,
77
+ supportsPunctuation: true,
78
+ },
79
+ {
80
+ code: "fr-FR",
81
+ name: "French",
82
+ supportsDiarization: true,
83
+ supportsPunctuation: true,
84
+ },
85
+ {
86
+ code: "de-DE",
87
+ name: "German",
88
+ supportsDiarization: true,
89
+ supportsPunctuation: true,
90
+ },
91
+ {
92
+ code: "it-IT",
93
+ name: "Italian",
94
+ supportsDiarization: true,
95
+ supportsPunctuation: true,
96
+ },
97
+ {
98
+ code: "pt-BR",
99
+ name: "Portuguese (Brazil)",
100
+ supportsDiarization: true,
101
+ supportsPunctuation: true,
102
+ },
103
+ {
104
+ code: "ja-JP",
105
+ name: "Japanese",
106
+ supportsDiarization: true,
107
+ supportsPunctuation: true,
108
+ },
109
+ {
110
+ code: "ko-KR",
111
+ name: "Korean",
112
+ supportsDiarization: true,
113
+ supportsPunctuation: true,
114
+ },
115
+ {
116
+ code: "zh-CN",
117
+ name: "Chinese (Simplified)",
118
+ supportsDiarization: true,
119
+ supportsPunctuation: true,
120
+ },
121
+ {
122
+ code: "zh-TW",
123
+ name: "Chinese (Traditional)",
124
+ supportsDiarization: true,
125
+ supportsPunctuation: true,
126
+ },
127
+ {
128
+ code: "ar-SA",
129
+ name: "Arabic",
130
+ supportsDiarization: true,
131
+ supportsPunctuation: true,
132
+ },
133
+ {
134
+ code: "hi-IN",
135
+ name: "Hindi",
136
+ supportsDiarization: true,
137
+ supportsPunctuation: true,
138
+ },
139
+ {
140
+ code: "ru-RU",
141
+ name: "Russian",
142
+ supportsDiarization: true,
143
+ supportsPunctuation: true,
144
+ },
145
+ ];
146
+ }
147
+ async transcribe(audio, options = {}) {
148
+ if (!this.isConfigured()) {
149
+ throw STTError.providerNotConfigured("google-stt");
150
+ }
151
+ const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
152
+ if (audioBuffer.length === 0) {
153
+ throw STTError.audioEmpty("google-stt");
154
+ }
155
+ const googleOptions = options;
156
+ const startTime = Date.now();
157
+ try {
158
+ // Build recognition config
159
+ const detectedFormat = options.format ?? "wav";
160
+ const config = {
161
+ encoding: this.getEncoding(detectedFormat),
162
+ // Omit sampleRateHertz for WAV/FLAC — the API reads it from the header.
163
+ // Hardcoding a wrong value causes "sample_rate_hertz must match WAV header" errors.
164
+ ...(detectedFormat !== "wav" && detectedFormat !== "flac"
165
+ ? { sampleRateHertz: options.sampleRate ?? 16000 }
166
+ : options.sampleRate
167
+ ? { sampleRateHertz: options.sampleRate }
168
+ : {}),
169
+ languageCode: options.language ?? "en-US",
170
+ enableAutomaticPunctuation: options.punctuation ?? true,
171
+ enableWordTimeOffsets: options.wordTimestamps ?? false,
172
+ enableWordConfidence: true,
173
+ profanityFilter: options.profanityFilter ?? false,
174
+ };
175
+ // Add model if specified
176
+ if (googleOptions.model) {
177
+ config.model = googleOptions.model;
178
+ }
179
+ // Add enhanced model option
180
+ if (googleOptions.useEnhanced) {
181
+ config.useEnhanced = true;
182
+ }
183
+ // Add diarization if requested
184
+ if (options.speakerDiarization) {
185
+ config.enableSpeakerDiarization = true;
186
+ if (options.speakerCount) {
187
+ config.diarizationSpeakerCount = options.speakerCount;
188
+ }
189
+ }
190
+ // Add max alternatives
191
+ if (googleOptions.maxAlternatives) {
192
+ config.maxAlternatives = googleOptions.maxAlternatives;
193
+ }
194
+ // Build request
195
+ const requestBody = {
196
+ config,
197
+ audio: {
198
+ content: audioBuffer.toString("base64"),
199
+ },
200
+ };
201
+ // Build URL with API key
202
+ const url = this.apiKey
203
+ ? `${this.baseUrl}/speech:recognize?key=${this.apiKey}`
204
+ : `${this.baseUrl}/speech:recognize`;
205
+ const controller = new AbortController();
206
+ const timeoutId = setTimeout(() => controller.abort(), 30000);
207
+ let response;
208
+ try {
209
+ response = await fetch(url, {
210
+ method: "POST",
211
+ headers: {
212
+ "Content-Type": "application/json",
213
+ ...(this.credentialsPath && !this.apiKey
214
+ ? { Authorization: `Bearer ${await this.getAccessToken()}` }
215
+ : {}),
216
+ },
217
+ body: JSON.stringify(requestBody),
218
+ signal: controller.signal,
219
+ });
220
+ }
221
+ catch (fetchErr) {
222
+ if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
223
+ throw STTError.transcriptionFailed("Google STT request timed out after 30 seconds", "google-stt", fetchErr);
224
+ }
225
+ throw fetchErr;
226
+ }
227
+ finally {
228
+ clearTimeout(timeoutId);
229
+ }
230
+ if (!response.ok) {
231
+ const errorData = await response
232
+ .json()
233
+ .catch(() => Object.create(null));
234
+ const errorMessage = errorData.error?.message ||
235
+ `HTTP ${response.status}`;
236
+ throw STTError.transcriptionFailed(errorMessage, "google-stt");
237
+ }
238
+ const data = (await response.json());
239
+ const latency = Date.now() - startTime;
240
+ // Handle empty results
241
+ if (!data.results || data.results.length === 0) {
242
+ return {
243
+ text: "",
244
+ confidence: 0,
245
+ language: options.language,
246
+ metadata: {
247
+ latency,
248
+ provider: "google-stt",
249
+ },
250
+ };
251
+ }
252
+ // Build result from all alternatives
253
+ const result = {
254
+ text: data.results
255
+ .map((r) => r.alternatives[0]?.transcript ?? "")
256
+ .join(" ")
257
+ .trim(),
258
+ confidence: this.calculateAverageConfidence(data.results),
259
+ language: data.results[0]?.languageCode ?? options.language,
260
+ metadata: {
261
+ latency,
262
+ provider: "google-stt",
263
+ billedTime: data.totalBilledTime,
264
+ },
265
+ };
266
+ // Add word timings
267
+ const words = [];
268
+ const speakers = new Set();
269
+ for (const resultItem of data.results) {
270
+ const alternative = resultItem.alternatives[0];
271
+ if (alternative?.words) {
272
+ for (const wordInfo of alternative.words) {
273
+ const word = {
274
+ word: wordInfo.word,
275
+ startTime: this.parseDuration(wordInfo.startTime),
276
+ endTime: this.parseDuration(wordInfo.endTime),
277
+ confidence: wordInfo.confidence,
278
+ };
279
+ if (wordInfo.speakerTag !== undefined) {
280
+ word.speaker = `Speaker ${wordInfo.speakerTag}`;
281
+ speakers.add(word.speaker);
282
+ }
283
+ words.push(word);
284
+ }
285
+ }
286
+ }
287
+ if (words.length > 0) {
288
+ result.words = words;
289
+ }
290
+ if (speakers.size > 0) {
291
+ result.speakers = Array.from(speakers);
292
+ }
293
+ // Add segments
294
+ result.segments = data.results.map((resultItem, index) => {
295
+ const alt = resultItem.alternatives[0];
296
+ return {
297
+ index,
298
+ text: alt?.transcript ?? "",
299
+ isFinal: true,
300
+ confidence: alt?.confidence ?? 0,
301
+ language: resultItem.languageCode,
302
+ };
303
+ });
304
+ logger.info(`[GoogleSTTHandler] Transcribed audio in ${latency}ms`);
305
+ return result;
306
+ }
307
+ catch (err) {
308
+ if (err instanceof STTError) {
309
+ throw err;
310
+ }
311
+ const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
312
+ logger.error(`[GoogleSTTHandler] Transcription failed: ${errorMessage}`);
313
+ throw STTError.transcriptionFailed(errorMessage, "google-stt", err instanceof Error ? err : undefined);
314
+ }
315
+ }
316
+ /**
317
+ * Streaming transcription (placeholder - requires WebSocket/gRPC)
318
+ */
319
+ async *transcribeStream(audioStream, options) {
320
+ // Google streaming STT requires gRPC or WebSocket connection
321
+ // For now, buffer and transcribe in chunks
322
+ const chunks = [];
323
+ let chunkIndex = 0;
324
+ for await (const chunk of audioStream) {
325
+ chunks.push(chunk);
326
+ // Process every ~5 seconds of audio (assuming 16kHz, 16-bit)
327
+ const bytesPerSecond = 16000 * 2; // 16kHz * 2 bytes
328
+ const totalBytes = chunks.reduce((sum, c) => sum + c.length, 0);
329
+ if (totalBytes >= bytesPerSecond * 5) {
330
+ const audio = Buffer.concat(chunks);
331
+ chunks.length = 0;
332
+ try {
333
+ const result = await this.transcribe(audio, options);
334
+ yield {
335
+ index: chunkIndex++,
336
+ text: result.text,
337
+ isFinal: false,
338
+ confidence: result.confidence,
339
+ };
340
+ }
341
+ catch (err) {
342
+ // M5: distinguish permanent (auth, schema, 4xx) from transient
343
+ // (5xx, 429, network) errors. Permanent errors retry indefinitely
344
+ // and racks up failed API calls; rethrow to terminate the stream.
345
+ // Transient errors get logged and skipped so a multi-minute audio
346
+ // stream can recover from a transient hiccup.
347
+ const msg = err instanceof Error ? err.message : String(err);
348
+ const isPermanent = /\b(401|403|404|UNAUTHENTICATED|PERMISSION_DENIED|INVALID_ARGUMENT|UNAUTHORIZED|FORBIDDEN|invalid.*credential|invalid.*key)\b/i.test(msg);
349
+ if (isPermanent) {
350
+ logger.error(`[GoogleSTTHandler] Permanent chunk error — terminating stream: ${msg}`);
351
+ throw err;
352
+ }
353
+ logger.warn(`[GoogleSTTHandler] Transient chunk failure (skipping): ${msg}`);
354
+ }
355
+ }
356
+ }
357
+ // Process remaining audio
358
+ if (chunks.length > 0) {
359
+ const audio = Buffer.concat(chunks);
360
+ try {
361
+ const result = await this.transcribe(audio, options);
362
+ yield {
363
+ index: chunkIndex,
364
+ text: result.text,
365
+ isFinal: true,
366
+ confidence: result.confidence,
367
+ };
368
+ }
369
+ catch (err) {
370
+ // Don't swallow the final chunk's terminal errors — auth/config/4xx
371
+ // failures here would otherwise look like a successful empty
372
+ // transcription, hiding the root cause from callers (CodeRabbit
373
+ // review). Mirror the permanent-vs-transient split used in the
374
+ // chunk loop above (Azure/Google share this taxonomy).
375
+ const msg = err instanceof Error ? err.message : String(err);
376
+ const isPermanent = /\b(401|403|404|Forbidden|Unauthorized|Invalid.*credential|Invalid.*key|Permission|PERMISSION_DENIED|UNAUTHENTICATED|INVALID_ARGUMENT)\b/i.test(msg);
377
+ if (isPermanent) {
378
+ logger.error(`[GoogleSTTHandler] Permanent final-chunk error — surfacing: ${msg}`);
379
+ throw err;
380
+ }
381
+ logger.warn(`[GoogleSTTHandler] Final chunk transcription failed (transient): ${msg}`);
382
+ }
383
+ }
384
+ }
385
+ /**
386
+ * Get encoding string for audio format
387
+ */
388
+ getEncoding(format) {
389
+ const encodings = {
390
+ mp3: "MP3",
391
+ wav: "LINEAR16",
392
+ ogg: "OGG_OPUS",
393
+ opus: "OGG_OPUS",
394
+ };
395
+ return encodings[format] ?? "LINEAR16";
396
+ }
397
+ /**
398
+ * Parse duration string (e.g., "1.5s") to seconds
399
+ */
400
+ parseDuration(duration) {
401
+ if (!duration) {
402
+ return 0;
403
+ }
404
+ const match = duration.match(/^([\d.]+)s$/);
405
+ return match ? parseFloat(match[1]) : 0;
406
+ }
407
+ /**
408
+ * Calculate average confidence from results
409
+ */
410
+ calculateAverageConfidence(results) {
411
+ const confidences = results
412
+ .map((r) => r.alternatives[0]?.confidence)
413
+ .filter((c) => typeof c === "number");
414
+ if (confidences.length === 0) {
415
+ return 0;
416
+ }
417
+ return confidences.reduce((sum, c) => sum + c, 0) / confidences.length;
418
+ }
419
+ /**
420
+ * Get access token from service account credentials.
421
+ *
422
+ * M3: previously caught all errors and returned `""`, which then caused
423
+ * a silent 401 from the Google API and a confusing downstream HTTP error
424
+ * with no trace of the original auth failure. Now rethrows as STTError so
425
+ * the caller sees the auth root cause.
426
+ */
427
+ async getAccessToken() {
428
+ try {
429
+ const { GoogleAuth } = await import("google-auth-library");
430
+ const auth = new GoogleAuth({
431
+ ...(this.credentialsPath ? { keyFilename: this.credentialsPath } : {}),
432
+ scopes: ["https://www.googleapis.com/auth/cloud-platform"],
433
+ });
434
+ const client = await auth.getClient();
435
+ const tokenResponse = await client.getAccessToken();
436
+ const token = tokenResponse.token;
437
+ if (!token) {
438
+ throw STTError.transcriptionFailed("Google access token returned empty — check GOOGLE_APPLICATION_CREDENTIALS path and service account permissions", "google-stt");
439
+ }
440
+ return token;
441
+ }
442
+ catch (err) {
443
+ logger.error(`[GoogleSTTHandler] Failed to acquire access token: ${err instanceof Error ? err.message : String(err)}`);
444
+ // Use instanceof — refactor-resilient and matches the pattern in
445
+ // transcribe(). The earlier `err.name === "STTError"` check would
446
+ // double-wrap if the base class ever overwrote `name`.
447
+ if (err instanceof STTError) {
448
+ throw err;
449
+ }
450
+ throw STTError.transcriptionFailed(`Google access token acquisition failed: ${err instanceof Error ? err.message : String(err)}`, "google-stt");
451
+ }
452
+ }
453
+ }
454
+ //# sourceMappingURL=GoogleSTT.js.map
@@ -0,0 +1,47 @@
1
+ /**
2
+ * OpenAI Realtime Voice API Handler
3
+ *
4
+ * Implementation of bidirectional voice communication using OpenAI's Realtime API.
5
+ *
6
+ * @module voice/providers/OpenAIRealtime
7
+ */
8
+ import { BaseRealtimeHandler } from "../RealtimeVoiceAPI.js";
9
+ import type { TTSAudioFormat, RealtimeAudioChunk, RealtimeConfig, RealtimeSession } from "../../types/index.js";
10
+ /**
11
+ * OpenAI Realtime API Handler
12
+ *
13
+ * Implements bidirectional voice communication with OpenAI's Realtime API.
14
+ *
15
+ * @see https://platform.openai.com/docs/api-reference/realtime
16
+ */
17
+ export declare class OpenAIRealtime extends BaseRealtimeHandler {
18
+ readonly name = "openai-realtime";
19
+ private readonly apiKey;
20
+ private ws;
21
+ private audioChunkIndex;
22
+ constructor(apiKey?: string);
23
+ isConfigured(): boolean;
24
+ getSupportedFormats(): TTSAudioFormat[];
25
+ connect(config: RealtimeConfig): Promise<RealtimeSession>;
26
+ disconnect(): Promise<void>;
27
+ sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
28
+ sendText(text: string): Promise<void>;
29
+ triggerResponse(): Promise<void>;
30
+ cancelResponse(): Promise<void>;
31
+ /**
32
+ * Send session update with configuration
33
+ */
34
+ private sendSessionUpdate;
35
+ /**
36
+ * Wait for session.created event
37
+ */
38
+ private waitForSessionCreated;
39
+ /**
40
+ * Handle incoming WebSocket messages
41
+ */
42
+ private handleMessage;
43
+ /**
44
+ * Handle function call from model
45
+ */
46
+ private handleFunctionCall;
47
+ }