@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -0,0 +1,585 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Speech-to-Text (STT) Type Definitions for NeuroLink
|
|
3
|
+
*
|
|
4
|
+
* All STT-specific types: options, results, handlers,
|
|
5
|
+
* provider-specific options, error codes, defaults, and type guards.
|
|
6
|
+
*
|
|
7
|
+
* @module types/stt
|
|
8
|
+
*/
|
|
9
|
+
import type { TTSAudioFormat } from "./tts.js";
|
|
10
|
+
/**
|
|
11
|
+
* STT configuration options
|
|
12
|
+
*/
|
|
13
|
+
export type STTOptions = {
|
|
14
|
+
/** Enable STT processing */
|
|
15
|
+
enabled?: boolean;
|
|
16
|
+
/** Override STT provider */
|
|
17
|
+
provider?: string;
|
|
18
|
+
/** Language code for transcription (e.g., "en-US") */
|
|
19
|
+
language?: string;
|
|
20
|
+
/** Audio format of input */
|
|
21
|
+
format?: TTSAudioFormat;
|
|
22
|
+
/** Sample rate in Hz */
|
|
23
|
+
sampleRate?: number;
|
|
24
|
+
/** Enable punctuation in transcription */
|
|
25
|
+
punctuation?: boolean;
|
|
26
|
+
/** Enable punctuation (alias) */
|
|
27
|
+
punctuate?: boolean;
|
|
28
|
+
/** Enable profanity filter */
|
|
29
|
+
profanityFilter?: boolean;
|
|
30
|
+
/** Enable speaker diarization */
|
|
31
|
+
speakerDiarization?: boolean;
|
|
32
|
+
/** Enable speaker diarization (alias) */
|
|
33
|
+
diarization?: boolean;
|
|
34
|
+
/** Number of speakers (for diarization) */
|
|
35
|
+
speakerCount?: number;
|
|
36
|
+
/** Enable word-level timestamps */
|
|
37
|
+
wordTimestamps?: boolean;
|
|
38
|
+
/** Model variant to use */
|
|
39
|
+
model?: string;
|
|
40
|
+
/** Custom vocabulary/phrases */
|
|
41
|
+
vocabulary?: string[];
|
|
42
|
+
/** Minimum confidence threshold */
|
|
43
|
+
confidenceThreshold?: number;
|
|
44
|
+
/**
|
|
45
|
+
* Maximum audio buffer size in bytes. STTProcessor rejects buffers over
|
|
46
|
+
* this limit before any provider call, preventing OOM on multi-GB inputs.
|
|
47
|
+
* Default: 25_000_000 (matches Whisper's documented 25MB ceiling).
|
|
48
|
+
*/
|
|
49
|
+
maxAudioBytes?: number;
|
|
50
|
+
};
|
|
51
|
+
/**
|
|
52
|
+
* STT result from transcription
|
|
53
|
+
*/
|
|
54
|
+
export type STTResult = {
|
|
55
|
+
/** Full transcribed text */
|
|
56
|
+
text: string;
|
|
57
|
+
/** Confidence score (0-1) */
|
|
58
|
+
confidence: number;
|
|
59
|
+
/** Detected language code */
|
|
60
|
+
language?: string;
|
|
61
|
+
/** Audio duration in seconds */
|
|
62
|
+
duration?: number;
|
|
63
|
+
/** Word-level timings */
|
|
64
|
+
words?: WordTiming[];
|
|
65
|
+
/** Transcription segments */
|
|
66
|
+
segments?: TranscriptionSegment[];
|
|
67
|
+
/** Speaker labels (for diarization) */
|
|
68
|
+
speakers?: string[];
|
|
69
|
+
/** Performance metadata */
|
|
70
|
+
metadata?: {
|
|
71
|
+
/** Processing latency in milliseconds */
|
|
72
|
+
latency: number;
|
|
73
|
+
/** Provider name */
|
|
74
|
+
provider?: string;
|
|
75
|
+
/** Model used */
|
|
76
|
+
model?: string;
|
|
77
|
+
/** Additional provider-specific metadata */
|
|
78
|
+
[key: string]: unknown;
|
|
79
|
+
};
|
|
80
|
+
};
|
|
81
|
+
/**
|
|
82
|
+
* STT language information
|
|
83
|
+
*/
|
|
84
|
+
export type STTLanguage = {
|
|
85
|
+
/** Language code (e.g., "en-US") */
|
|
86
|
+
code: string;
|
|
87
|
+
/** Language name */
|
|
88
|
+
name: string;
|
|
89
|
+
/** Whether the language supports speaker diarization */
|
|
90
|
+
supportsDiarization?: boolean;
|
|
91
|
+
/** Whether the language supports punctuation */
|
|
92
|
+
supportsPunctuation?: boolean;
|
|
93
|
+
};
|
|
94
|
+
/**
|
|
95
|
+
* Word-level timing information
|
|
96
|
+
*/
|
|
97
|
+
export type WordTiming = {
|
|
98
|
+
/** The word */
|
|
99
|
+
word: string;
|
|
100
|
+
/** Start time in seconds */
|
|
101
|
+
startTime?: number;
|
|
102
|
+
/** Start time alias */
|
|
103
|
+
start?: number;
|
|
104
|
+
/** End time in seconds */
|
|
105
|
+
endTime?: number;
|
|
106
|
+
/** End time alias */
|
|
107
|
+
end?: number;
|
|
108
|
+
/** Confidence score (0-1) */
|
|
109
|
+
confidence?: number;
|
|
110
|
+
/** Speaker label (for diarization) */
|
|
111
|
+
speaker?: string;
|
|
112
|
+
};
|
|
113
|
+
/**
|
|
114
|
+
* Transcription segment for streaming STT
|
|
115
|
+
*/
|
|
116
|
+
export type TranscriptionSegment = {
|
|
117
|
+
/** Segment index */
|
|
118
|
+
index?: number;
|
|
119
|
+
/** Transcribed text */
|
|
120
|
+
text: string;
|
|
121
|
+
/** Whether this is a final result */
|
|
122
|
+
isFinal: boolean;
|
|
123
|
+
/** Confidence score (0-1) */
|
|
124
|
+
confidence?: number;
|
|
125
|
+
/** Start time in audio (seconds) */
|
|
126
|
+
startTime?: number;
|
|
127
|
+
/** Start time (alias for startTime) */
|
|
128
|
+
start?: number;
|
|
129
|
+
/** End time in audio (seconds) */
|
|
130
|
+
endTime?: number;
|
|
131
|
+
/** End time (alias for endTime) */
|
|
132
|
+
end?: number;
|
|
133
|
+
/** Word-level timings */
|
|
134
|
+
words?: WordTiming[];
|
|
135
|
+
/** Speaker label */
|
|
136
|
+
speaker?: string;
|
|
137
|
+
/** Detected language */
|
|
138
|
+
language?: string;
|
|
139
|
+
};
|
|
140
|
+
export type STTHandler = {
|
|
141
|
+
transcribe(audio: Buffer | ArrayBuffer, options: STTOptions): Promise<STTResult>;
|
|
142
|
+
transcribeStream?(audioStream: AsyncIterable<Buffer>, options: STTOptions): AsyncIterable<TranscriptionSegment>;
|
|
143
|
+
getSupportedLanguages?(): Promise<STTLanguage[]>;
|
|
144
|
+
getSupportedFormats(): TTSAudioFormat[];
|
|
145
|
+
isConfigured(): boolean;
|
|
146
|
+
maxAudioDuration?: number;
|
|
147
|
+
supportsStreaming?: boolean;
|
|
148
|
+
};
|
|
149
|
+
/**
|
|
150
|
+
* STT error codes
|
|
151
|
+
*/
|
|
152
|
+
export declare const STT_ERROR_CODES: {
|
|
153
|
+
readonly AUDIO_EMPTY: "STT_AUDIO_EMPTY";
|
|
154
|
+
readonly AUDIO_TOO_LONG: "STT_AUDIO_TOO_LONG";
|
|
155
|
+
readonly INVALID_AUDIO_FORMAT: "STT_INVALID_AUDIO_FORMAT";
|
|
156
|
+
readonly LANGUAGE_NOT_SUPPORTED: "STT_LANGUAGE_NOT_SUPPORTED";
|
|
157
|
+
readonly TRANSCRIPTION_FAILED: "STT_TRANSCRIPTION_FAILED";
|
|
158
|
+
readonly PROVIDER_NOT_CONFIGURED: "STT_PROVIDER_NOT_CONFIGURED";
|
|
159
|
+
readonly PROVIDER_NOT_SUPPORTED: "STT_PROVIDER_NOT_SUPPORTED";
|
|
160
|
+
readonly STREAM_ERROR: "STT_STREAM_ERROR";
|
|
161
|
+
readonly STREAMING_NOT_SUPPORTED: "STT_STREAMING_NOT_SUPPORTED";
|
|
162
|
+
};
|
|
163
|
+
/**
|
|
164
|
+
* Default STT options
|
|
165
|
+
*/
|
|
166
|
+
export declare const DEFAULT_STT_OPTIONS: Required<Pick<STTOptions, "language" | "punctuation" | "profanityFilter" | "sampleRate">>;
|
|
167
|
+
/**
|
|
168
|
+
* Type guard for STTResult
|
|
169
|
+
*/
|
|
170
|
+
export declare function isSTTResult(value: unknown): value is STTResult;
|
|
171
|
+
/**
|
|
172
|
+
* Type guard for valid STTOptions
|
|
173
|
+
*/
|
|
174
|
+
export declare function isValidSTTOptions(options: unknown): options is STTOptions;
|
|
175
|
+
/**
|
|
176
|
+
* Type guard for TranscriptionSegment
|
|
177
|
+
*/
|
|
178
|
+
export declare function isTranscriptionSegment(value: unknown): value is TranscriptionSegment;
|
|
179
|
+
export type AzureRecognitionMode = "interactive" | "conversation" | "dictation";
|
|
180
|
+
export type AzureOutputFormat = "simple" | "detailed";
|
|
181
|
+
export type AzureSTTOptions = STTOptions & {
|
|
182
|
+
recognitionMode?: AzureRecognitionMode;
|
|
183
|
+
outputFormat?: AzureOutputFormat;
|
|
184
|
+
interimResults?: boolean;
|
|
185
|
+
endpointId?: string;
|
|
186
|
+
/** Custom endpoint ID (alias for endpointId) */
|
|
187
|
+
customEndpointId?: string;
|
|
188
|
+
connectionTimeout?: number;
|
|
189
|
+
silenceTimeout?: number;
|
|
190
|
+
profanityOption?: "masked" | "removed" | "raw";
|
|
191
|
+
/** Profanity mode (alias for profanityOption) */
|
|
192
|
+
profanityMode?: "masked" | "removed" | "raw";
|
|
193
|
+
initialSilenceTimeout?: number;
|
|
194
|
+
enableLogging?: boolean;
|
|
195
|
+
phraseList?: string[];
|
|
196
|
+
/** Whether to request detailed output format */
|
|
197
|
+
detailed?: boolean;
|
|
198
|
+
wordLevelConfidence?: boolean;
|
|
199
|
+
initialSilenceTimeoutMs?: number;
|
|
200
|
+
endSilenceTimeoutMs?: number;
|
|
201
|
+
};
|
|
202
|
+
export type DeepgramModel = "nova-2" | "nova-2-general" | "nova-2-meeting" | "nova-2-phonecall" | "nova-2-voicemail" | "nova-2-finance" | "nova-2-medical" | "nova" | "enhanced" | "base";
|
|
203
|
+
export type DeepgramSTTOptions = STTOptions & {
|
|
204
|
+
model?: DeepgramModel | "nova-3";
|
|
205
|
+
smartFormat?: boolean;
|
|
206
|
+
search?: string[];
|
|
207
|
+
replace?: Array<{
|
|
208
|
+
find: string;
|
|
209
|
+
replace: string;
|
|
210
|
+
}>;
|
|
211
|
+
utterances?: boolean;
|
|
212
|
+
utterSplit?: number;
|
|
213
|
+
/** Alias for utterSplit (legacy field name) */
|
|
214
|
+
uttSplit?: number;
|
|
215
|
+
paragraphs?: boolean;
|
|
216
|
+
keywords?: string[];
|
|
217
|
+
keywordBoost?: "legacy" | "medium" | "high";
|
|
218
|
+
fillerWords?: boolean;
|
|
219
|
+
detectTopics?: boolean;
|
|
220
|
+
detectEntities?: boolean;
|
|
221
|
+
summarize?: boolean;
|
|
222
|
+
redact?: ("pci" | "numbers" | "ssn")[];
|
|
223
|
+
};
|
|
224
|
+
export type GoogleSTTModel = "latest_short" | "latest_long" | "telephony" | "medical_conversation" | "medical_dictation" | "command_and_search" | "phone_call" | "video" | "default";
|
|
225
|
+
export type GoogleSTTAudioEncoding = "ENCODING_UNSPECIFIED" | "LINEAR16" | "FLAC" | "MULAW" | "AMR" | "AMR_WB" | "OGG_OPUS" | "SPEEX_WITH_HEADER_BYTE" | "MP3" | "WEBM_OPUS";
|
|
226
|
+
export type GoogleSTTOptions = STTOptions & {
|
|
227
|
+
model?: GoogleSTTModel;
|
|
228
|
+
encoding?: GoogleSTTAudioEncoding;
|
|
229
|
+
sampleRateHertz?: number;
|
|
230
|
+
audioChannelCount?: number;
|
|
231
|
+
enableSeparateRecognitionPerChannel?: boolean;
|
|
232
|
+
alternativeLanguageCodes?: string[];
|
|
233
|
+
maxAlternatives?: number;
|
|
234
|
+
enableAutomaticPunctuation?: boolean;
|
|
235
|
+
enableSpokenPunctuation?: boolean;
|
|
236
|
+
enableSpokenEmojis?: boolean;
|
|
237
|
+
speechContexts?: Array<{
|
|
238
|
+
phrases: string[];
|
|
239
|
+
boost?: number;
|
|
240
|
+
}>;
|
|
241
|
+
adaptation?: {
|
|
242
|
+
phraseSets?: string[];
|
|
243
|
+
customClasses?: string[];
|
|
244
|
+
};
|
|
245
|
+
useEnhanced?: boolean;
|
|
246
|
+
keywords?: string[];
|
|
247
|
+
};
|
|
248
|
+
export type WhisperModel = "whisper-1";
|
|
249
|
+
export type WhisperSTTOptions = STTOptions & {
|
|
250
|
+
model?: WhisperModel;
|
|
251
|
+
responseFormat?: "json" | "text" | "srt" | "verbose_json" | "vtt";
|
|
252
|
+
temperature?: number;
|
|
253
|
+
prompt?: string;
|
|
254
|
+
/** Translate audio to English instead of transcribing in original language */
|
|
255
|
+
translate?: boolean;
|
|
256
|
+
};
|
|
257
|
+
export type AzureWord = {
|
|
258
|
+
Word: string;
|
|
259
|
+
Offset: number;
|
|
260
|
+
Duration: number;
|
|
261
|
+
Confidence?: number;
|
|
262
|
+
};
|
|
263
|
+
export type AzureNBest = {
|
|
264
|
+
Confidence: number;
|
|
265
|
+
Lexical: string;
|
|
266
|
+
ITN: string;
|
|
267
|
+
MaskedITN: string;
|
|
268
|
+
Display: string;
|
|
269
|
+
Words?: AzureWord[];
|
|
270
|
+
};
|
|
271
|
+
export type AzureRecognitionResult = {
|
|
272
|
+
RecognitionStatus: "Success" | "NoMatch" | "InitialSilenceTimeout" | "BabbleTimeout" | "Error" | string;
|
|
273
|
+
Offset?: number;
|
|
274
|
+
Duration?: number;
|
|
275
|
+
DisplayText?: string;
|
|
276
|
+
NBest?: AzureNBest[];
|
|
277
|
+
};
|
|
278
|
+
export type AzureSpeakerRecognitionResult = AzureRecognitionResult & {
|
|
279
|
+
SpeakerId?: string;
|
|
280
|
+
};
|
|
281
|
+
export type DeepgramWord = {
|
|
282
|
+
word: string;
|
|
283
|
+
start: number;
|
|
284
|
+
end: number;
|
|
285
|
+
confidence: number;
|
|
286
|
+
speaker?: number;
|
|
287
|
+
punctuated_word?: string;
|
|
288
|
+
};
|
|
289
|
+
export type DeepgramAlternative = {
|
|
290
|
+
transcript: string;
|
|
291
|
+
confidence: number;
|
|
292
|
+
words: DeepgramWord[];
|
|
293
|
+
paragraphs?: {
|
|
294
|
+
transcript: string;
|
|
295
|
+
paragraphs: Array<{
|
|
296
|
+
sentences: Array<{
|
|
297
|
+
text: string;
|
|
298
|
+
start: number;
|
|
299
|
+
end: number;
|
|
300
|
+
}>;
|
|
301
|
+
}>;
|
|
302
|
+
};
|
|
303
|
+
};
|
|
304
|
+
export type DeepgramChannel = {
|
|
305
|
+
alternatives: DeepgramAlternative[];
|
|
306
|
+
};
|
|
307
|
+
export type DeepgramUtterance = {
|
|
308
|
+
start: number;
|
|
309
|
+
end: number;
|
|
310
|
+
confidence: number;
|
|
311
|
+
channel: number;
|
|
312
|
+
transcript: string;
|
|
313
|
+
words: DeepgramWord[];
|
|
314
|
+
speaker?: number;
|
|
315
|
+
id?: string;
|
|
316
|
+
};
|
|
317
|
+
export type DeepgramResult = {
|
|
318
|
+
channels: DeepgramChannel[];
|
|
319
|
+
utterances?: DeepgramUtterance[];
|
|
320
|
+
};
|
|
321
|
+
export type DeepgramResponse = {
|
|
322
|
+
metadata: {
|
|
323
|
+
request_id: string;
|
|
324
|
+
transaction_key?: string;
|
|
325
|
+
sha256?: string;
|
|
326
|
+
created: string;
|
|
327
|
+
duration: number;
|
|
328
|
+
channels: number;
|
|
329
|
+
models: string[];
|
|
330
|
+
model_info?: Record<string, {
|
|
331
|
+
name: string;
|
|
332
|
+
version: string;
|
|
333
|
+
}>;
|
|
334
|
+
};
|
|
335
|
+
results: DeepgramResult;
|
|
336
|
+
};
|
|
337
|
+
export type GoogleWordInfo = {
|
|
338
|
+
startTime: string;
|
|
339
|
+
endTime: string;
|
|
340
|
+
word: string;
|
|
341
|
+
confidence?: number;
|
|
342
|
+
speakerTag?: number;
|
|
343
|
+
};
|
|
344
|
+
export type GoogleSpeechRecognitionAlternative = {
|
|
345
|
+
transcript: string;
|
|
346
|
+
confidence: number;
|
|
347
|
+
words?: GoogleWordInfo[];
|
|
348
|
+
};
|
|
349
|
+
export type GoogleSpeechRecognitionResult = {
|
|
350
|
+
alternatives: GoogleSpeechRecognitionAlternative[];
|
|
351
|
+
channelTag?: number;
|
|
352
|
+
languageCode?: string;
|
|
353
|
+
resultEndTime?: string;
|
|
354
|
+
};
|
|
355
|
+
export type GoogleLongRunningRecognizeResponse = {
|
|
356
|
+
results: GoogleSpeechRecognitionResult[];
|
|
357
|
+
totalBilledTime?: string;
|
|
358
|
+
};
|
|
359
|
+
export type GoogleRecognizeResponse = {
|
|
360
|
+
results?: GoogleSpeechRecognitionResult[];
|
|
361
|
+
totalBilledTime?: string;
|
|
362
|
+
};
|
|
363
|
+
export type GoogleOperationResponse = {
|
|
364
|
+
name: string;
|
|
365
|
+
done: boolean;
|
|
366
|
+
metadata?: {
|
|
367
|
+
progressPercent?: number;
|
|
368
|
+
startTime?: string;
|
|
369
|
+
lastUpdateTime?: string;
|
|
370
|
+
};
|
|
371
|
+
response?: GoogleLongRunningRecognizeResponse;
|
|
372
|
+
error?: {
|
|
373
|
+
code: number;
|
|
374
|
+
message: string;
|
|
375
|
+
};
|
|
376
|
+
};
|
|
377
|
+
export type GoogleRecognitionConfig = {
|
|
378
|
+
encoding: string;
|
|
379
|
+
sampleRateHertz?: number;
|
|
380
|
+
languageCode: string;
|
|
381
|
+
enableAutomaticPunctuation?: boolean;
|
|
382
|
+
enableWordTimeOffsets?: boolean;
|
|
383
|
+
enableWordConfidence?: boolean;
|
|
384
|
+
model?: string;
|
|
385
|
+
useEnhanced?: boolean;
|
|
386
|
+
maxAlternatives?: number;
|
|
387
|
+
profanityFilter?: boolean;
|
|
388
|
+
enableSpeakerDiarization?: boolean;
|
|
389
|
+
diarizationSpeakerCount?: number;
|
|
390
|
+
};
|
|
391
|
+
export type GoogleRecognitionAudio = {
|
|
392
|
+
content: string;
|
|
393
|
+
};
|
|
394
|
+
export type WhisperTranscriptionWord = {
|
|
395
|
+
word: string;
|
|
396
|
+
start: number;
|
|
397
|
+
end: number;
|
|
398
|
+
};
|
|
399
|
+
export type WhisperTranscriptionSegment = {
|
|
400
|
+
id: number;
|
|
401
|
+
seek: number;
|
|
402
|
+
start: number;
|
|
403
|
+
end: number;
|
|
404
|
+
text: string;
|
|
405
|
+
tokens: number[];
|
|
406
|
+
temperature: number;
|
|
407
|
+
avg_logprob: number;
|
|
408
|
+
compression_ratio: number;
|
|
409
|
+
no_speech_prob: number;
|
|
410
|
+
};
|
|
411
|
+
export type WhisperVerboseResponse = {
|
|
412
|
+
task: string;
|
|
413
|
+
language: string;
|
|
414
|
+
duration: number;
|
|
415
|
+
text: string;
|
|
416
|
+
segments?: WhisperTranscriptionSegment[];
|
|
417
|
+
words?: WhisperTranscriptionWord[];
|
|
418
|
+
};
|
|
419
|
+
export type WhisperSimpleResponse = {
|
|
420
|
+
text: string;
|
|
421
|
+
};
|
|
422
|
+
export type ElevenLabsVoice = {
|
|
423
|
+
voice_id: string;
|
|
424
|
+
name: string;
|
|
425
|
+
category: string;
|
|
426
|
+
labels?: {
|
|
427
|
+
accent?: string;
|
|
428
|
+
description?: string;
|
|
429
|
+
age?: string;
|
|
430
|
+
gender?: string;
|
|
431
|
+
use_case?: string;
|
|
432
|
+
};
|
|
433
|
+
preview_url?: string;
|
|
434
|
+
};
|
|
435
|
+
export type ElevenLabsVoicesResponse = {
|
|
436
|
+
voices: ElevenLabsVoice[];
|
|
437
|
+
};
|
|
438
|
+
export type AzureVoiceInfo = {
|
|
439
|
+
Name: string;
|
|
440
|
+
DisplayName: string;
|
|
441
|
+
LocalName: string;
|
|
442
|
+
ShortName: string;
|
|
443
|
+
Gender: string;
|
|
444
|
+
Locale: string;
|
|
445
|
+
LocaleName: string;
|
|
446
|
+
VoiceType: string;
|
|
447
|
+
Status: string;
|
|
448
|
+
WordsPerMinute?: string;
|
|
449
|
+
};
|
|
450
|
+
export type GoogleAudioConfig = {
|
|
451
|
+
audioEncoding: string;
|
|
452
|
+
speakingRate?: number;
|
|
453
|
+
pitch?: number;
|
|
454
|
+
volumeGainDb?: number;
|
|
455
|
+
sampleRateHertz?: number;
|
|
456
|
+
effectsProfileId?: string[];
|
|
457
|
+
};
|
|
458
|
+
export type GoogleVoiceSelectionParams = {
|
|
459
|
+
languageCode: string;
|
|
460
|
+
name?: string;
|
|
461
|
+
ssmlGender?: string;
|
|
462
|
+
};
|
|
463
|
+
export type GoogleSynthesisInput = {
|
|
464
|
+
text?: string;
|
|
465
|
+
ssml?: string;
|
|
466
|
+
};
|
|
467
|
+
export type GoogleSynthesizeRequest = {
|
|
468
|
+
input: GoogleSynthesisInput;
|
|
469
|
+
voice: GoogleVoiceSelectionParams;
|
|
470
|
+
audioConfig: GoogleAudioConfig;
|
|
471
|
+
};
|
|
472
|
+
export type GoogleVoiceInfo = {
|
|
473
|
+
languageCodes: string[];
|
|
474
|
+
name: string;
|
|
475
|
+
ssmlGender: string;
|
|
476
|
+
naturalSampleRateHertz: number;
|
|
477
|
+
};
|
|
478
|
+
export type GoogleListVoicesResponse = {
|
|
479
|
+
voices: GoogleVoiceInfo[];
|
|
480
|
+
};
|
|
481
|
+
export type GoogleSynthesizeResponse = {
|
|
482
|
+
audioContent: string;
|
|
483
|
+
};
|
|
484
|
+
export type OpenAIRealtimeEvent = {
|
|
485
|
+
type: string;
|
|
486
|
+
event_id?: string;
|
|
487
|
+
[key: string]: unknown;
|
|
488
|
+
};
|
|
489
|
+
export type OpenAISessionCreated = OpenAIRealtimeEvent & {
|
|
490
|
+
type: "session.created";
|
|
491
|
+
session: {
|
|
492
|
+
id: string;
|
|
493
|
+
object: string;
|
|
494
|
+
model: string;
|
|
495
|
+
modalities: string[];
|
|
496
|
+
voice: string;
|
|
497
|
+
input_audio_format: string;
|
|
498
|
+
output_audio_format: string;
|
|
499
|
+
turn_detection: {
|
|
500
|
+
type: string;
|
|
501
|
+
threshold?: number;
|
|
502
|
+
prefix_padding_ms?: number;
|
|
503
|
+
silence_duration_ms?: number;
|
|
504
|
+
};
|
|
505
|
+
tools: unknown[];
|
|
506
|
+
tool_choice: string;
|
|
507
|
+
temperature: number;
|
|
508
|
+
max_response_output_tokens: string | number;
|
|
509
|
+
};
|
|
510
|
+
};
|
|
511
|
+
export type OpenAIAudioDelta = OpenAIRealtimeEvent & {
|
|
512
|
+
type: "response.audio.delta";
|
|
513
|
+
response_id: string;
|
|
514
|
+
item_id: string;
|
|
515
|
+
output_index: number;
|
|
516
|
+
content_index: number;
|
|
517
|
+
delta: string;
|
|
518
|
+
};
|
|
519
|
+
export type OpenAITranscriptDelta = OpenAIRealtimeEvent & {
|
|
520
|
+
type: "response.audio_transcript.delta" | "conversation.item.input_audio_transcription.completed";
|
|
521
|
+
delta?: string;
|
|
522
|
+
transcript?: string;
|
|
523
|
+
};
|
|
524
|
+
export type GeminiMessage = {
|
|
525
|
+
setup?: {
|
|
526
|
+
model: string;
|
|
527
|
+
generationConfig?: {
|
|
528
|
+
responseModalities?: string[];
|
|
529
|
+
speechConfig?: {
|
|
530
|
+
voiceConfig?: {
|
|
531
|
+
prebuiltVoiceConfig?: {
|
|
532
|
+
voiceName?: string;
|
|
533
|
+
};
|
|
534
|
+
};
|
|
535
|
+
};
|
|
536
|
+
};
|
|
537
|
+
systemInstruction?: {
|
|
538
|
+
parts: Array<{
|
|
539
|
+
text: string;
|
|
540
|
+
}>;
|
|
541
|
+
};
|
|
542
|
+
tools?: unknown[];
|
|
543
|
+
};
|
|
544
|
+
realtimeInput?: {
|
|
545
|
+
mediaChunks: Array<{
|
|
546
|
+
mimeType: string;
|
|
547
|
+
data: string;
|
|
548
|
+
}>;
|
|
549
|
+
};
|
|
550
|
+
clientContent?: {
|
|
551
|
+
turns: Array<{
|
|
552
|
+
role: string;
|
|
553
|
+
parts: Array<{
|
|
554
|
+
text: string;
|
|
555
|
+
}>;
|
|
556
|
+
}>;
|
|
557
|
+
turnComplete: boolean;
|
|
558
|
+
};
|
|
559
|
+
};
|
|
560
|
+
export type GeminiResponse = {
|
|
561
|
+
setupComplete?: Record<string, unknown>;
|
|
562
|
+
serverContent?: {
|
|
563
|
+
modelTurn?: {
|
|
564
|
+
parts: Array<{
|
|
565
|
+
text?: string;
|
|
566
|
+
inlineData?: {
|
|
567
|
+
mimeType: string;
|
|
568
|
+
data: string;
|
|
569
|
+
};
|
|
570
|
+
}>;
|
|
571
|
+
};
|
|
572
|
+
turnComplete?: boolean;
|
|
573
|
+
interrupted?: boolean;
|
|
574
|
+
};
|
|
575
|
+
toolCall?: {
|
|
576
|
+
functionCalls: Array<{
|
|
577
|
+
id: string;
|
|
578
|
+
name: string;
|
|
579
|
+
args: Record<string, unknown>;
|
|
580
|
+
}>;
|
|
581
|
+
};
|
|
582
|
+
toolCallCancellation?: {
|
|
583
|
+
ids: string[];
|
|
584
|
+
};
|
|
585
|
+
};
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
 * Speech-to-Text (STT) Type Definitions for NeuroLink
 *
 * All STT-specific types: options, results, handlers,
 * provider-specific options, error codes, defaults, and type guards.
 *
 * @module types/stt
 */
// ============================================================================
// STT ERROR CODES
// ============================================================================
/**
 * Stable machine-readable STT error codes.
 *
 * Frozen so the runtime object honours the `readonly` modifiers declared
 * for every key in the companion stt.d.ts; without the freeze a caller
 * could reassign a code at runtime even though the compiler reports the
 * object as immutable.
 */
export const STT_ERROR_CODES = Object.freeze({
    AUDIO_EMPTY: "STT_AUDIO_EMPTY",
    AUDIO_TOO_LONG: "STT_AUDIO_TOO_LONG",
    INVALID_AUDIO_FORMAT: "STT_INVALID_AUDIO_FORMAT",
    LANGUAGE_NOT_SUPPORTED: "STT_LANGUAGE_NOT_SUPPORTED",
    TRANSCRIPTION_FAILED: "STT_TRANSCRIPTION_FAILED",
    PROVIDER_NOT_CONFIGURED: "STT_PROVIDER_NOT_CONFIGURED",
    PROVIDER_NOT_SUPPORTED: "STT_PROVIDER_NOT_SUPPORTED",
    STREAM_ERROR: "STT_STREAM_ERROR",
    STREAMING_NOT_SUPPORTED: "STT_STREAMING_NOT_SUPPORTED",
});
|
|
26
|
+
// ============================================================================
// STT DEFAULTS
// ============================================================================
/**
 * Default STT options
 *
 * Baseline values: US English, automatic punctuation on, profanity filter
 * off, and 16 kHz input audio. Presumably merged beneath caller-supplied
 * options by the STT processing path — confirm against sttProcessor usage.
 */
export const DEFAULT_STT_OPTIONS = {
    language: "en-US",
    punctuation: true,
    profanityFilter: false,
    sampleRate: 16000,
};
|
|
38
|
+
// ============================================================================
// STT TYPE GUARDS
// ============================================================================
/**
 * Type guard for STTResult.
 *
 * A value qualifies when it is a non-null object carrying a string `text`
 * and a numeric `confidence` within the inclusive range [0, 1]. A NaN
 * confidence fails the range comparisons and is therefore rejected.
 */
export function isSTTResult(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    const { text, confidence } = value;
    if (typeof text !== "string" || typeof confidence !== "number") {
        return false;
    }
    return confidence >= 0 && confidence <= 1;
}
|
|
54
|
+
/**
 * Type guard for valid STTOptions.
 *
 * Structural check only: the value must be a non-null object, and the two
 * numeric fields it may carry must be in range — `sampleRate` a finite
 * number greater than 0, `speakerCount` a finite number between 1 and 10
 * (inclusive). Other fields are not inspected.
 *
 * Fix over the previous revision: comparison operators are always false
 * for NaN, so `NaN` (and `Infinity` for sampleRate) slipped through the
 * old `typeof` + comparison checks. `Number.isFinite` rejects NaN,
 * ±Infinity, and non-numbers in one step.
 */
export function isValidSTTOptions(options) {
    if (!options || typeof options !== "object") {
        return false;
    }
    const opts = options;
    if (opts.sampleRate !== undefined) {
        // Number.isFinite also covers the typeof "number" check.
        if (!Number.isFinite(opts.sampleRate) || opts.sampleRate <= 0) {
            return false;
        }
    }
    if (opts.speakerCount !== undefined) {
        if (!Number.isFinite(opts.speakerCount) ||
            opts.speakerCount < 1 ||
            opts.speakerCount > 10) {
            return false;
        }
    }
    return true;
}
|
|
76
|
+
/**
 * Type guard for TranscriptionSegment.
 *
 * A value qualifies when it is a non-null object with a string `text` and
 * a boolean `isFinal`. `index` is optional on the type, so it is only
 * rejected when present with a non-numeric value — segments that omit it
 * must still pass the guard.
 */
export function isTranscriptionSegment(value) {
    if (value === null || typeof value !== "object") {
        return false;
    }
    const segment = value;
    if (segment.index !== undefined && typeof segment.index !== "number") {
        return false;
    }
    return typeof segment.text === "string" && typeof segment.isFinal === "boolean";
}
|
package/dist/types/tools.d.ts
CHANGED
|
@@ -9,6 +9,7 @@ import type { StandardRecord, StringArray, ZodUnknownSchema } from "./aliases.js
|
|
|
9
9
|
import type { ValidationError } from "../utils/parameterValidation.js";
|
|
10
10
|
import type { MCPToolAnnotations } from "./mcp.js";
|
|
11
11
|
import type { Logger } from "./utilities.js";
|
|
12
|
+
import type { HITLExecutionState } from "./hitl.js";
|
|
12
13
|
/**
|
|
13
14
|
* Commonly used Zod schema type aliases for cleaner type declarations
|
|
14
15
|
*/
|
|
@@ -48,6 +49,7 @@ export type ExecutionContext<T = StandardRecord> = {
|
|
|
48
49
|
timeoutMs?: number;
|
|
49
50
|
maxRetries?: number;
|
|
50
51
|
startTime?: number;
|
|
52
|
+
hitlState?: HITLExecutionState;
|
|
51
53
|
};
|
|
52
54
|
/**
|
|
53
55
|
* Cache configuration options
|