@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -0,0 +1,585 @@
1
+ /**
2
+ * Speech-to-Text (STT) Type Definitions for NeuroLink
3
+ *
4
+ * All STT-specific types: options, results, handlers,
5
+ * provider-specific options, error codes, defaults, and type guards.
6
+ *
7
+ * @module types/stt
8
+ */
9
+ import type { TTSAudioFormat } from "./tts.js";
10
+ /**
11
+ * STT configuration options
12
+ */
13
+ export type STTOptions = {
14
+ /** Enable STT processing */
15
+ enabled?: boolean;
16
+ /** Override STT provider */
17
+ provider?: string;
18
+ /** Language code for transcription (e.g., "en-US") */
19
+ language?: string;
20
+ /** Audio format of input */
21
+ format?: TTSAudioFormat;
22
+ /** Sample rate in Hz */
23
+ sampleRate?: number;
24
+ /** Enable punctuation in transcription */
25
+ punctuation?: boolean;
26
+ /** Enable punctuation (alias) */
27
+ punctuate?: boolean;
28
+ /** Enable profanity filter */
29
+ profanityFilter?: boolean;
30
+ /** Enable speaker diarization */
31
+ speakerDiarization?: boolean;
32
+ /** Enable speaker diarization (alias) */
33
+ diarization?: boolean;
34
+ /** Number of speakers (for diarization) */
35
+ speakerCount?: number;
36
+ /** Enable word-level timestamps */
37
+ wordTimestamps?: boolean;
38
+ /** Model variant to use */
39
+ model?: string;
40
+ /** Custom vocabulary/phrases */
41
+ vocabulary?: string[];
42
+ /** Minimum confidence threshold */
43
+ confidenceThreshold?: number;
44
+ /**
45
+ * Maximum audio buffer size in bytes. STTProcessor rejects buffers over
46
+ * this limit before any provider call, preventing OOM on multi-GB inputs.
47
+ * Default: 25_000_000 (matches Whisper's documented 25MB ceiling).
48
+ */
49
+ maxAudioBytes?: number;
50
+ };
51
+ /**
52
+ * STT result from transcription
53
+ */
54
+ export type STTResult = {
55
+ /** Full transcribed text */
56
+ text: string;
57
+ /** Confidence score (0-1) */
58
+ confidence: number;
59
+ /** Detected language code */
60
+ language?: string;
61
+ /** Audio duration in seconds */
62
+ duration?: number;
63
+ /** Word-level timings */
64
+ words?: WordTiming[];
65
+ /** Transcription segments */
66
+ segments?: TranscriptionSegment[];
67
+ /** Speaker labels (for diarization) */
68
+ speakers?: string[];
69
+ /** Performance metadata */
70
+ metadata?: {
71
+ /** Processing latency in milliseconds */
72
+ latency: number;
73
+ /** Provider name */
74
+ provider?: string;
75
+ /** Model used */
76
+ model?: string;
77
+ /** Additional provider-specific metadata */
78
+ [key: string]: unknown;
79
+ };
80
+ };
81
+ /**
82
+ * STT language information
83
+ */
84
+ export type STTLanguage = {
85
+ /** Language code (e.g., "en-US") */
86
+ code: string;
87
+ /** Language name */
88
+ name: string;
89
+ /** Whether the language supports speaker diarization */
90
+ supportsDiarization?: boolean;
91
+ /** Whether the language supports punctuation */
92
+ supportsPunctuation?: boolean;
93
+ };
94
+ /**
95
+ * Word-level timing information
96
+ */
97
+ export type WordTiming = {
98
+ /** The word */
99
+ word: string;
100
+ /** Start time in seconds */
101
+ startTime?: number;
102
+ /** Start time alias */
103
+ start?: number;
104
+ /** End time in seconds */
105
+ endTime?: number;
106
+ /** End time alias */
107
+ end?: number;
108
+ /** Confidence score (0-1) */
109
+ confidence?: number;
110
+ /** Speaker label (for diarization) */
111
+ speaker?: string;
112
+ };
113
+ /**
114
+ * Transcription segment for streaming STT
115
+ */
116
+ export type TranscriptionSegment = {
117
+ /** Segment index */
118
+ index?: number;
119
+ /** Transcribed text */
120
+ text: string;
121
+ /** Whether this is a final result */
122
+ isFinal: boolean;
123
+ /** Confidence score (0-1) */
124
+ confidence?: number;
125
+ /** Start time in audio (seconds) */
126
+ startTime?: number;
127
+ /** Start time (alias for startTime) */
128
+ start?: number;
129
+ /** End time in audio (seconds) */
130
+ endTime?: number;
131
+ /** End time (alias for endTime) */
132
+ end?: number;
133
+ /** Word-level timings */
134
+ words?: WordTiming[];
135
+ /** Speaker label */
136
+ speaker?: string;
137
+ /** Detected language */
138
+ language?: string;
139
+ };
140
+ export type STTHandler = {
141
+ transcribe(audio: Buffer | ArrayBuffer, options: STTOptions): Promise<STTResult>;
142
+ transcribeStream?(audioStream: AsyncIterable<Buffer>, options: STTOptions): AsyncIterable<TranscriptionSegment>;
143
+ getSupportedLanguages?(): Promise<STTLanguage[]>;
144
+ getSupportedFormats(): TTSAudioFormat[];
145
+ isConfigured(): boolean;
146
+ maxAudioDuration?: number;
147
+ supportsStreaming?: boolean;
148
+ };
149
+ /**
150
+ * STT error codes
151
+ */
152
+ export declare const STT_ERROR_CODES: {
153
+ readonly AUDIO_EMPTY: "STT_AUDIO_EMPTY";
154
+ readonly AUDIO_TOO_LONG: "STT_AUDIO_TOO_LONG";
155
+ readonly INVALID_AUDIO_FORMAT: "STT_INVALID_AUDIO_FORMAT";
156
+ readonly LANGUAGE_NOT_SUPPORTED: "STT_LANGUAGE_NOT_SUPPORTED";
157
+ readonly TRANSCRIPTION_FAILED: "STT_TRANSCRIPTION_FAILED";
158
+ readonly PROVIDER_NOT_CONFIGURED: "STT_PROVIDER_NOT_CONFIGURED";
159
+ readonly PROVIDER_NOT_SUPPORTED: "STT_PROVIDER_NOT_SUPPORTED";
160
+ readonly STREAM_ERROR: "STT_STREAM_ERROR";
161
+ readonly STREAMING_NOT_SUPPORTED: "STT_STREAMING_NOT_SUPPORTED";
162
+ };
163
+ /**
164
+ * Default STT options
165
+ */
166
+ export declare const DEFAULT_STT_OPTIONS: Required<Pick<STTOptions, "language" | "punctuation" | "profanityFilter" | "sampleRate">>;
167
+ /**
168
+ * Type guard for STTResult
169
+ */
170
+ export declare function isSTTResult(value: unknown): value is STTResult;
171
+ /**
172
+ * Type guard for valid STTOptions
173
+ */
174
+ export declare function isValidSTTOptions(options: unknown): options is STTOptions;
175
+ /**
176
+ * Type guard for TranscriptionSegment
177
+ */
178
+ export declare function isTranscriptionSegment(value: unknown): value is TranscriptionSegment;
179
+ export type AzureRecognitionMode = "interactive" | "conversation" | "dictation";
180
+ export type AzureOutputFormat = "simple" | "detailed";
181
+ export type AzureSTTOptions = STTOptions & {
182
+ recognitionMode?: AzureRecognitionMode;
183
+ outputFormat?: AzureOutputFormat;
184
+ interimResults?: boolean;
185
+ endpointId?: string;
186
+ /** Custom endpoint ID (alias for endpointId) */
187
+ customEndpointId?: string;
188
+ connectionTimeout?: number;
189
+ silenceTimeout?: number;
190
+ profanityOption?: "masked" | "removed" | "raw";
191
+ /** Profanity mode (alias for profanityOption) */
192
+ profanityMode?: "masked" | "removed" | "raw";
193
+ initialSilenceTimeout?: number;
194
+ enableLogging?: boolean;
195
+ phraseList?: string[];
196
+ /** Whether to request detailed output format */
197
+ detailed?: boolean;
198
+ wordLevelConfidence?: boolean;
199
+ initialSilenceTimeoutMs?: number;
200
+ endSilenceTimeoutMs?: number;
201
+ };
202
+ export type DeepgramModel = "nova-2" | "nova-2-general" | "nova-2-meeting" | "nova-2-phonecall" | "nova-2-voicemail" | "nova-2-finance" | "nova-2-medical" | "nova" | "enhanced" | "base";
203
+ export type DeepgramSTTOptions = STTOptions & {
204
+ model?: DeepgramModel | "nova-3";
205
+ smartFormat?: boolean;
206
+ search?: string[];
207
+ replace?: Array<{
208
+ find: string;
209
+ replace: string;
210
+ }>;
211
+ utterances?: boolean;
212
+ utterSplit?: number;
213
+ /** Alias for utterSplit (legacy field name) */
214
+ uttSplit?: number;
215
+ paragraphs?: boolean;
216
+ keywords?: string[];
217
+ keywordBoost?: "legacy" | "medium" | "high";
218
+ fillerWords?: boolean;
219
+ detectTopics?: boolean;
220
+ detectEntities?: boolean;
221
+ summarize?: boolean;
222
+ redact?: ("pci" | "numbers" | "ssn")[];
223
+ };
224
+ export type GoogleSTTModel = "latest_short" | "latest_long" | "telephony" | "medical_conversation" | "medical_dictation" | "command_and_search" | "phone_call" | "video" | "default";
225
+ export type GoogleSTTAudioEncoding = "ENCODING_UNSPECIFIED" | "LINEAR16" | "FLAC" | "MULAW" | "AMR" | "AMR_WB" | "OGG_OPUS" | "SPEEX_WITH_HEADER_BYTE" | "MP3" | "WEBM_OPUS";
226
+ export type GoogleSTTOptions = STTOptions & {
227
+ model?: GoogleSTTModel;
228
+ encoding?: GoogleSTTAudioEncoding;
229
+ sampleRateHertz?: number;
230
+ audioChannelCount?: number;
231
+ enableSeparateRecognitionPerChannel?: boolean;
232
+ alternativeLanguageCodes?: string[];
233
+ maxAlternatives?: number;
234
+ enableAutomaticPunctuation?: boolean;
235
+ enableSpokenPunctuation?: boolean;
236
+ enableSpokenEmojis?: boolean;
237
+ speechContexts?: Array<{
238
+ phrases: string[];
239
+ boost?: number;
240
+ }>;
241
+ adaptation?: {
242
+ phraseSets?: string[];
243
+ customClasses?: string[];
244
+ };
245
+ useEnhanced?: boolean;
246
+ keywords?: string[];
247
+ };
248
+ export type WhisperModel = "whisper-1";
249
+ export type WhisperSTTOptions = STTOptions & {
250
+ model?: WhisperModel;
251
+ responseFormat?: "json" | "text" | "srt" | "verbose_json" | "vtt";
252
+ temperature?: number;
253
+ prompt?: string;
254
+ /** Translate audio to English instead of transcribing in original language */
255
+ translate?: boolean;
256
+ };
257
+ export type AzureWord = {
258
+ Word: string;
259
+ Offset: number;
260
+ Duration: number;
261
+ Confidence?: number;
262
+ };
263
+ export type AzureNBest = {
264
+ Confidence: number;
265
+ Lexical: string;
266
+ ITN: string;
267
+ MaskedITN: string;
268
+ Display: string;
269
+ Words?: AzureWord[];
270
+ };
271
+ export type AzureRecognitionResult = {
272
+ RecognitionStatus: "Success" | "NoMatch" | "InitialSilenceTimeout" | "BabbleTimeout" | "Error" | string;
273
+ Offset?: number;
274
+ Duration?: number;
275
+ DisplayText?: string;
276
+ NBest?: AzureNBest[];
277
+ };
278
+ export type AzureSpeakerRecognitionResult = AzureRecognitionResult & {
279
+ SpeakerId?: string;
280
+ };
281
+ export type DeepgramWord = {
282
+ word: string;
283
+ start: number;
284
+ end: number;
285
+ confidence: number;
286
+ speaker?: number;
287
+ punctuated_word?: string;
288
+ };
289
+ export type DeepgramAlternative = {
290
+ transcript: string;
291
+ confidence: number;
292
+ words: DeepgramWord[];
293
+ paragraphs?: {
294
+ transcript: string;
295
+ paragraphs: Array<{
296
+ sentences: Array<{
297
+ text: string;
298
+ start: number;
299
+ end: number;
300
+ }>;
301
+ }>;
302
+ };
303
+ };
304
+ export type DeepgramChannel = {
305
+ alternatives: DeepgramAlternative[];
306
+ };
307
+ export type DeepgramUtterance = {
308
+ start: number;
309
+ end: number;
310
+ confidence: number;
311
+ channel: number;
312
+ transcript: string;
313
+ words: DeepgramWord[];
314
+ speaker?: number;
315
+ id?: string;
316
+ };
317
+ export type DeepgramResult = {
318
+ channels: DeepgramChannel[];
319
+ utterances?: DeepgramUtterance[];
320
+ };
321
+ export type DeepgramResponse = {
322
+ metadata: {
323
+ request_id: string;
324
+ transaction_key?: string;
325
+ sha256?: string;
326
+ created: string;
327
+ duration: number;
328
+ channels: number;
329
+ models: string[];
330
+ model_info?: Record<string, {
331
+ name: string;
332
+ version: string;
333
+ }>;
334
+ };
335
+ results: DeepgramResult;
336
+ };
337
+ export type GoogleWordInfo = {
338
+ startTime: string;
339
+ endTime: string;
340
+ word: string;
341
+ confidence?: number;
342
+ speakerTag?: number;
343
+ };
344
+ export type GoogleSpeechRecognitionAlternative = {
345
+ transcript: string;
346
+ confidence: number;
347
+ words?: GoogleWordInfo[];
348
+ };
349
+ export type GoogleSpeechRecognitionResult = {
350
+ alternatives: GoogleSpeechRecognitionAlternative[];
351
+ channelTag?: number;
352
+ languageCode?: string;
353
+ resultEndTime?: string;
354
+ };
355
+ export type GoogleLongRunningRecognizeResponse = {
356
+ results: GoogleSpeechRecognitionResult[];
357
+ totalBilledTime?: string;
358
+ };
359
+ export type GoogleRecognizeResponse = {
360
+ results?: GoogleSpeechRecognitionResult[];
361
+ totalBilledTime?: string;
362
+ };
363
+ export type GoogleOperationResponse = {
364
+ name: string;
365
+ done: boolean;
366
+ metadata?: {
367
+ progressPercent?: number;
368
+ startTime?: string;
369
+ lastUpdateTime?: string;
370
+ };
371
+ response?: GoogleLongRunningRecognizeResponse;
372
+ error?: {
373
+ code: number;
374
+ message: string;
375
+ };
376
+ };
377
+ export type GoogleRecognitionConfig = {
378
+ encoding: string;
379
+ sampleRateHertz?: number;
380
+ languageCode: string;
381
+ enableAutomaticPunctuation?: boolean;
382
+ enableWordTimeOffsets?: boolean;
383
+ enableWordConfidence?: boolean;
384
+ model?: string;
385
+ useEnhanced?: boolean;
386
+ maxAlternatives?: number;
387
+ profanityFilter?: boolean;
388
+ enableSpeakerDiarization?: boolean;
389
+ diarizationSpeakerCount?: number;
390
+ };
391
+ export type GoogleRecognitionAudio = {
392
+ content: string;
393
+ };
394
+ export type WhisperTranscriptionWord = {
395
+ word: string;
396
+ start: number;
397
+ end: number;
398
+ };
399
+ export type WhisperTranscriptionSegment = {
400
+ id: number;
401
+ seek: number;
402
+ start: number;
403
+ end: number;
404
+ text: string;
405
+ tokens: number[];
406
+ temperature: number;
407
+ avg_logprob: number;
408
+ compression_ratio: number;
409
+ no_speech_prob: number;
410
+ };
411
+ export type WhisperVerboseResponse = {
412
+ task: string;
413
+ language: string;
414
+ duration: number;
415
+ text: string;
416
+ segments?: WhisperTranscriptionSegment[];
417
+ words?: WhisperTranscriptionWord[];
418
+ };
419
+ export type WhisperSimpleResponse = {
420
+ text: string;
421
+ };
422
+ export type ElevenLabsVoice = {
423
+ voice_id: string;
424
+ name: string;
425
+ category: string;
426
+ labels?: {
427
+ accent?: string;
428
+ description?: string;
429
+ age?: string;
430
+ gender?: string;
431
+ use_case?: string;
432
+ };
433
+ preview_url?: string;
434
+ };
435
+ export type ElevenLabsVoicesResponse = {
436
+ voices: ElevenLabsVoice[];
437
+ };
438
+ export type AzureVoiceInfo = {
439
+ Name: string;
440
+ DisplayName: string;
441
+ LocalName: string;
442
+ ShortName: string;
443
+ Gender: string;
444
+ Locale: string;
445
+ LocaleName: string;
446
+ VoiceType: string;
447
+ Status: string;
448
+ WordsPerMinute?: string;
449
+ };
450
+ export type GoogleAudioConfig = {
451
+ audioEncoding: string;
452
+ speakingRate?: number;
453
+ pitch?: number;
454
+ volumeGainDb?: number;
455
+ sampleRateHertz?: number;
456
+ effectsProfileId?: string[];
457
+ };
458
+ export type GoogleVoiceSelectionParams = {
459
+ languageCode: string;
460
+ name?: string;
461
+ ssmlGender?: string;
462
+ };
463
+ export type GoogleSynthesisInput = {
464
+ text?: string;
465
+ ssml?: string;
466
+ };
467
+ export type GoogleSynthesizeRequest = {
468
+ input: GoogleSynthesisInput;
469
+ voice: GoogleVoiceSelectionParams;
470
+ audioConfig: GoogleAudioConfig;
471
+ };
472
+ export type GoogleVoiceInfo = {
473
+ languageCodes: string[];
474
+ name: string;
475
+ ssmlGender: string;
476
+ naturalSampleRateHertz: number;
477
+ };
478
+ export type GoogleListVoicesResponse = {
479
+ voices: GoogleVoiceInfo[];
480
+ };
481
+ export type GoogleSynthesizeResponse = {
482
+ audioContent: string;
483
+ };
484
+ export type OpenAIRealtimeEvent = {
485
+ type: string;
486
+ event_id?: string;
487
+ [key: string]: unknown;
488
+ };
489
+ export type OpenAISessionCreated = OpenAIRealtimeEvent & {
490
+ type: "session.created";
491
+ session: {
492
+ id: string;
493
+ object: string;
494
+ model: string;
495
+ modalities: string[];
496
+ voice: string;
497
+ input_audio_format: string;
498
+ output_audio_format: string;
499
+ turn_detection: {
500
+ type: string;
501
+ threshold?: number;
502
+ prefix_padding_ms?: number;
503
+ silence_duration_ms?: number;
504
+ };
505
+ tools: unknown[];
506
+ tool_choice: string;
507
+ temperature: number;
508
+ max_response_output_tokens: string | number;
509
+ };
510
+ };
511
+ export type OpenAIAudioDelta = OpenAIRealtimeEvent & {
512
+ type: "response.audio.delta";
513
+ response_id: string;
514
+ item_id: string;
515
+ output_index: number;
516
+ content_index: number;
517
+ delta: string;
518
+ };
519
+ export type OpenAITranscriptDelta = OpenAIRealtimeEvent & {
520
+ type: "response.audio_transcript.delta" | "conversation.item.input_audio_transcription.completed";
521
+ delta?: string;
522
+ transcript?: string;
523
+ };
524
+ export type GeminiMessage = {
525
+ setup?: {
526
+ model: string;
527
+ generationConfig?: {
528
+ responseModalities?: string[];
529
+ speechConfig?: {
530
+ voiceConfig?: {
531
+ prebuiltVoiceConfig?: {
532
+ voiceName?: string;
533
+ };
534
+ };
535
+ };
536
+ };
537
+ systemInstruction?: {
538
+ parts: Array<{
539
+ text: string;
540
+ }>;
541
+ };
542
+ tools?: unknown[];
543
+ };
544
+ realtimeInput?: {
545
+ mediaChunks: Array<{
546
+ mimeType: string;
547
+ data: string;
548
+ }>;
549
+ };
550
+ clientContent?: {
551
+ turns: Array<{
552
+ role: string;
553
+ parts: Array<{
554
+ text: string;
555
+ }>;
556
+ }>;
557
+ turnComplete: boolean;
558
+ };
559
+ };
560
+ export type GeminiResponse = {
561
+ setupComplete?: Record<string, unknown>;
562
+ serverContent?: {
563
+ modelTurn?: {
564
+ parts: Array<{
565
+ text?: string;
566
+ inlineData?: {
567
+ mimeType: string;
568
+ data: string;
569
+ };
570
+ }>;
571
+ };
572
+ turnComplete?: boolean;
573
+ interrupted?: boolean;
574
+ };
575
+ toolCall?: {
576
+ functionCalls: Array<{
577
+ id: string;
578
+ name: string;
579
+ args: Record<string, unknown>;
580
+ }>;
581
+ };
582
+ toolCallCancellation?: {
583
+ ids: string[];
584
+ };
585
+ };
@@ -0,0 +1,90 @@
1
+ /**
2
+ * Speech-to-Text (STT) Type Definitions for NeuroLink
3
+ *
4
+ * All STT-specific types: options, results, handlers,
5
+ * provider-specific options, error codes, defaults, and type guards.
6
+ *
7
+ * @module types/stt
8
+ */
9
+ // ============================================================================
10
+ // STT ERROR CODES
11
+ // ============================================================================
12
+ /**
13
+ * STT error codes
14
+ */
15
+ export const STT_ERROR_CODES = {
16
+ AUDIO_EMPTY: "STT_AUDIO_EMPTY",
17
+ AUDIO_TOO_LONG: "STT_AUDIO_TOO_LONG",
18
+ INVALID_AUDIO_FORMAT: "STT_INVALID_AUDIO_FORMAT",
19
+ LANGUAGE_NOT_SUPPORTED: "STT_LANGUAGE_NOT_SUPPORTED",
20
+ TRANSCRIPTION_FAILED: "STT_TRANSCRIPTION_FAILED",
21
+ PROVIDER_NOT_CONFIGURED: "STT_PROVIDER_NOT_CONFIGURED",
22
+ PROVIDER_NOT_SUPPORTED: "STT_PROVIDER_NOT_SUPPORTED",
23
+ STREAM_ERROR: "STT_STREAM_ERROR",
24
+ STREAMING_NOT_SUPPORTED: "STT_STREAMING_NOT_SUPPORTED",
25
+ };
26
+ // ============================================================================
27
+ // STT DEFAULTS
28
+ // ============================================================================
29
+ /**
30
+ * Default STT options
31
+ */
32
+ export const DEFAULT_STT_OPTIONS = {
33
+ language: "en-US",
34
+ punctuation: true,
35
+ profanityFilter: false,
36
+ sampleRate: 16000,
37
+ };
38
+ // ============================================================================
39
+ // STT TYPE GUARDS
40
+ // ============================================================================
41
+ /**
42
+ * Type guard for STTResult
43
+ */
44
+ export function isSTTResult(value) {
45
+ if (!value || typeof value !== "object") {
46
+ return false;
47
+ }
48
+ const obj = value;
49
+ return (typeof obj.text === "string" &&
50
+ typeof obj.confidence === "number" &&
51
+ obj.confidence >= 0 &&
52
+ obj.confidence <= 1);
53
+ }
54
+ /**
55
+ * Type guard for valid STTOptions
56
+ */
57
+ export function isValidSTTOptions(options) {
58
+ if (!options || typeof options !== "object") {
59
+ return false;
60
+ }
61
+ const opts = options;
62
+ if (opts.sampleRate !== undefined) {
63
+ if (typeof opts.sampleRate !== "number" || opts.sampleRate <= 0) {
64
+ return false;
65
+ }
66
+ }
67
+ if (opts.speakerCount !== undefined) {
68
+ if (typeof opts.speakerCount !== "number" ||
69
+ opts.speakerCount < 1 ||
70
+ opts.speakerCount > 10) {
71
+ return false;
72
+ }
73
+ }
74
+ return true;
75
+ }
76
+ /**
77
+ * Type guard for TranscriptionSegment
78
+ */
79
+ export function isTranscriptionSegment(value) {
80
+ if (!value || typeof value !== "object") {
81
+ return false;
82
+ }
83
+ const obj = value;
84
+ // `index` is optional on the type — accept either undefined or a number,
85
+ // otherwise valid segments without an index would fail the guard.
86
+ return ((obj.index === undefined || typeof obj.index === "number") &&
87
+ typeof obj.text === "string" &&
88
+ typeof obj.isFinal === "boolean");
89
+ }
90
+ //# sourceMappingURL=stt.js.map
@@ -9,6 +9,7 @@ import type { StandardRecord, StringArray, ZodUnknownSchema } from "./aliases.js
9
9
  import type { ValidationError } from "../utils/parameterValidation.js";
10
10
  import type { MCPToolAnnotations } from "./mcp.js";
11
11
  import type { Logger } from "./utilities.js";
12
+ import type { HITLExecutionState } from "./hitl.js";
12
13
  /**
13
14
  * Commonly used Zod schema type aliases for cleaner type declarations
14
15
  */
@@ -48,6 +49,7 @@ export type ExecutionContext<T = StandardRecord> = {
48
49
  timeoutMs?: number;
49
50
  maxRetries?: number;
50
51
  startTime?: number;
52
+ hitlState?: HITLExecutionState;
51
53
  };
52
54
  /**
53
55
  * Cache configuration options