@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
|
@@ -10,6 +10,7 @@ import type { MiddlewareFactoryOptions, OnFinishCallback, OnErrorCallback } from
|
|
|
10
10
|
import type { DirectorModeOptions, DirectorSegment, VideoGenerationResult, VideoOutputOptions } from "./multimodal.js";
|
|
11
11
|
import type { PPTGenerationResult, PPTOutputOptions } from "./ppt.js";
|
|
12
12
|
import type { TTSOptions, TTSResult } from "./tts.js";
|
|
13
|
+
import type { STTOptions, STTResult } from "./stt.js";
|
|
13
14
|
import type { StandardRecord, ValidationSchema, ZodUnknownSchema } from "./aliases.js";
|
|
14
15
|
import type { NeurolinkCredentials } from "./providers.js";
|
|
15
16
|
import type { FileWithMetadata } from "./file.js";
|
|
@@ -142,6 +143,27 @@ export type GenerateOptions = {
|
|
|
142
143
|
* ```
|
|
143
144
|
*/
|
|
144
145
|
tts?: TTSOptions;
|
|
146
|
+
/**
|
|
147
|
+
* Speech-to-Text (STT) configuration
|
|
148
|
+
*
|
|
149
|
+
* Enable audio transcription. When enabled, the audio provided via `stt.audio`
|
|
150
|
+
* will be transcribed to text and used as the prompt.
|
|
151
|
+
*
|
|
152
|
+
* @example
|
|
153
|
+
* ```typescript
|
|
154
|
+
* const neurolink = new NeuroLink();
|
|
155
|
+
* const result = await neurolink.generate({
|
|
156
|
+
* input: { text: "" },
|
|
157
|
+
* provider: "openai",
|
|
158
|
+
* stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
|
|
159
|
+
* });
|
|
160
|
+
* // STT transcribes the audio, result.transcription contains the transcription
|
|
161
|
+
* ```
|
|
162
|
+
*/
|
|
163
|
+
stt?: STTOptions & {
|
|
164
|
+
provider?: string;
|
|
165
|
+
audio?: Buffer | ArrayBuffer;
|
|
166
|
+
};
|
|
145
167
|
/**
|
|
146
168
|
* Thinking/reasoning configuration for extended thinking models
|
|
147
169
|
*
|
|
@@ -660,6 +682,8 @@ export type GenerateResult = {
|
|
|
660
682
|
reasoning?: string;
|
|
661
683
|
/** Token count for reasoning content */
|
|
662
684
|
reasoningTokens?: number;
|
|
685
|
+
/** STT transcription result (present when stt.enabled is true and audio input was provided) */
|
|
686
|
+
transcription?: STTResult;
|
|
663
687
|
retries?: {
|
|
664
688
|
count: number;
|
|
665
689
|
errors: Array<{
|
|
@@ -868,6 +892,27 @@ export type TextGenerationOptions = {
|
|
|
868
892
|
* ```
|
|
869
893
|
*/
|
|
870
894
|
tts?: TTSOptions;
|
|
895
|
+
/**
|
|
896
|
+
* Speech-to-Text (STT) configuration
|
|
897
|
+
*
|
|
898
|
+
* Enable audio transcription. When enabled, the audio provided via `stt.audio`
|
|
899
|
+
* will be transcribed to text and used as the prompt.
|
|
900
|
+
*
|
|
901
|
+
* @example
|
|
902
|
+
* ```typescript
|
|
903
|
+
* const neurolink = new NeuroLink();
|
|
904
|
+
* const result = await neurolink.generate({
|
|
905
|
+
* input: { text: "" },
|
|
906
|
+
* provider: "openai",
|
|
907
|
+
* stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
|
|
908
|
+
* });
|
|
909
|
+
* // STT transcribes the audio, result.transcription contains the transcription
|
|
910
|
+
* ```
|
|
911
|
+
*/
|
|
912
|
+
stt?: STTOptions & {
|
|
913
|
+
provider?: string;
|
|
914
|
+
audio?: Buffer | ArrayBuffer;
|
|
915
|
+
};
|
|
871
916
|
enableEvaluation?: boolean;
|
|
872
917
|
enableAnalytics?: boolean;
|
|
873
918
|
context?: Record<string, JsonValue>;
|
|
@@ -1033,6 +1078,8 @@ export type TextGenerationResult = {
|
|
|
1033
1078
|
analytics?: AnalyticsData;
|
|
1034
1079
|
evaluation?: EvaluationData;
|
|
1035
1080
|
audio?: TTSResult;
|
|
1081
|
+
/** STT transcription result (present when stt input was processed) */
|
|
1082
|
+
transcription?: STTResult;
|
|
1036
1083
|
/** Video generation result */
|
|
1037
1084
|
video?: VideoGenerationResult;
|
|
1038
1085
|
/** PowerPoint generation result */
|
package/dist/lib/types/hitl.d.ts
CHANGED
|
@@ -48,7 +48,7 @@ export * from "./subscription.js";
|
|
|
48
48
|
export * from "./task.js";
|
|
49
49
|
export * from "./taskClassification.js";
|
|
50
50
|
export * from "./tools.js";
|
|
51
|
-
export * from "./
|
|
51
|
+
export * from "./voice.js";
|
|
52
52
|
export * from "./universalProviderOptions.js";
|
|
53
53
|
export * from "./utilities.js";
|
|
54
54
|
export * from "./workflow.js";
|
package/dist/lib/types/index.js
CHANGED
|
@@ -49,7 +49,7 @@ export * from "./subscription.js";
|
|
|
49
49
|
export * from "./task.js";
|
|
50
50
|
export * from "./taskClassification.js";
|
|
51
51
|
export * from "./tools.js";
|
|
52
|
-
export * from "./
|
|
52
|
+
export * from "./voice.js";
|
|
53
53
|
export * from "./universalProviderOptions.js";
|
|
54
54
|
export * from "./utilities.js";
|
|
55
55
|
export * from "./workflow.js";
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime Voice Type Definitions for NeuroLink
|
|
3
|
+
*
|
|
4
|
+
* All realtime/bidirectional voice types: session, config, messages,
|
|
5
|
+
* event handlers, provider types, handler types, error codes, defaults,
|
|
6
|
+
* and type guards.
|
|
7
|
+
*
|
|
8
|
+
* @module types/realtime
|
|
9
|
+
*/
|
|
10
|
+
import type { TTSAudioFormat } from "./tts.js";
|
|
11
|
+
type RealtimeProviderCapability = "tts" | "stt" | "realtime" | "streaming";
|
|
12
|
+
/**
|
|
13
|
+
* Realtime session state
|
|
14
|
+
*/
|
|
15
|
+
export type RealtimeSessionState = "disconnected" | "connecting" | "connected" | "disconnecting" | "error";
|
|
16
|
+
/**
|
|
17
|
+
* Realtime voice configuration
|
|
18
|
+
*/
|
|
19
|
+
export type RealtimeConfig = {
|
|
20
|
+
/**
|
|
21
|
+
* Provider to use. Must match the handler key registered with
|
|
22
|
+
* `RealtimeProcessor.registerHandler()` — currently `"openai-realtime"`
|
|
23
|
+
* (registered in `providerRegistry.ts`) and `"gemini-live"` (registered in
|
|
24
|
+
* `providerRegistry.ts`). Aliasing is handled at registry/CLI parse time,
|
|
25
|
+
* not here.
|
|
26
|
+
*/
|
|
27
|
+
provider: "openai-realtime" | "gemini-live";
|
|
28
|
+
/** API key */
|
|
29
|
+
apiKey?: string;
|
|
30
|
+
/** Model to use */
|
|
31
|
+
model?: string;
|
|
32
|
+
/** Voice for TTS output */
|
|
33
|
+
voice?: string;
|
|
34
|
+
/** Input language */
|
|
35
|
+
inputLanguage?: string;
|
|
36
|
+
/** Output language */
|
|
37
|
+
outputLanguage?: string;
|
|
38
|
+
/** System prompt for the AI */
|
|
39
|
+
systemPrompt?: string;
|
|
40
|
+
/** Session timeout in milliseconds */
|
|
41
|
+
timeout?: number;
|
|
42
|
+
/** Audio input format */
|
|
43
|
+
inputFormat?: TTSAudioFormat;
|
|
44
|
+
/** Audio output format */
|
|
45
|
+
outputFormat?: TTSAudioFormat;
|
|
46
|
+
/** Input sample rate */
|
|
47
|
+
inputSampleRate?: number;
|
|
48
|
+
/** Output sample rate */
|
|
49
|
+
outputSampleRate?: number;
|
|
50
|
+
/** Enable voice activity detection */
|
|
51
|
+
vadEnabled?: boolean;
|
|
52
|
+
/** VAD threshold (0-1) */
|
|
53
|
+
vadThreshold?: number;
|
|
54
|
+
/** Turn detection mode */
|
|
55
|
+
turnDetection?: "server_vad" | "manual";
|
|
56
|
+
/** Instructions/system prompt for the session */
|
|
57
|
+
instructions?: string;
|
|
58
|
+
/** Temperature for AI responses */
|
|
59
|
+
temperature?: number;
|
|
60
|
+
/** Tools/functions available to the model */
|
|
61
|
+
tools?: RealtimeTool[];
|
|
62
|
+
};
|
|
63
|
+
/**
|
|
64
|
+
* Realtime tool definition
|
|
65
|
+
*/
|
|
66
|
+
export type RealtimeTool = {
|
|
67
|
+
/** Tool name */
|
|
68
|
+
name: string;
|
|
69
|
+
/** Tool description */
|
|
70
|
+
description: string;
|
|
71
|
+
/** JSON schema for parameters */
|
|
72
|
+
parameters: Record<string, unknown>;
|
|
73
|
+
};
|
|
74
|
+
/**
|
|
75
|
+
* Realtime session information
|
|
76
|
+
*/
|
|
77
|
+
export type RealtimeSession = {
|
|
78
|
+
/** Session ID */
|
|
79
|
+
id: string;
|
|
80
|
+
/** Current state */
|
|
81
|
+
state: RealtimeSessionState;
|
|
82
|
+
/** Provider name — narrowed to the validated config provider union so
|
|
83
|
+
* session state stays aligned with what `connect()` accepts. */
|
|
84
|
+
provider: RealtimeConfig["provider"];
|
|
85
|
+
/** Model being used */
|
|
86
|
+
model?: string;
|
|
87
|
+
/** Session creation time */
|
|
88
|
+
createdAt: Date;
|
|
89
|
+
/** Last activity time */
|
|
90
|
+
lastActivityAt: Date;
|
|
91
|
+
/** Session configuration */
|
|
92
|
+
config: RealtimeConfig;
|
|
93
|
+
/** Check if session is open */
|
|
94
|
+
isOpen?: () => boolean;
|
|
95
|
+
/** Close the session */
|
|
96
|
+
close?: () => Promise<void>;
|
|
97
|
+
};
|
|
98
|
+
/**
|
|
99
|
+
* Realtime audio chunk
|
|
100
|
+
*/
|
|
101
|
+
export type RealtimeAudioChunk = {
|
|
102
|
+
/** Audio data */
|
|
103
|
+
data: Buffer;
|
|
104
|
+
/** Chunk sequence number */
|
|
105
|
+
index: number;
|
|
106
|
+
/** Whether this is the final chunk */
|
|
107
|
+
isFinal: boolean;
|
|
108
|
+
/** Audio format */
|
|
109
|
+
format: TTSAudioFormat;
|
|
110
|
+
/** Sample rate */
|
|
111
|
+
sampleRate?: number;
|
|
112
|
+
/** Duration of this chunk in milliseconds */
|
|
113
|
+
durationMs?: number;
|
|
114
|
+
};
|
|
115
|
+
/**
|
|
116
|
+
* Realtime message types
|
|
117
|
+
*/
|
|
118
|
+
export type RealtimeMessageType = "audio" | "text" | "transcript" | "function_call" | "function_result" | "error" | "session_update" | "turn_start" | "turn_end";
|
|
119
|
+
/**
|
|
120
|
+
* Realtime message
|
|
121
|
+
*/
|
|
122
|
+
export type RealtimeMessage = {
|
|
123
|
+
/** Message type */
|
|
124
|
+
type: RealtimeMessageType;
|
|
125
|
+
/** Message ID */
|
|
126
|
+
id?: string;
|
|
127
|
+
/** Audio data (for audio messages) */
|
|
128
|
+
audio?: RealtimeAudioChunk;
|
|
129
|
+
/** Text content (for text/transcript messages) */
|
|
130
|
+
text?: string;
|
|
131
|
+
/** Whether this is a partial result */
|
|
132
|
+
isPartial?: boolean;
|
|
133
|
+
/** Function call data */
|
|
134
|
+
functionCall?: {
|
|
135
|
+
name: string;
|
|
136
|
+
arguments: Record<string, unknown>;
|
|
137
|
+
};
|
|
138
|
+
/** Function result data */
|
|
139
|
+
functionResult?: {
|
|
140
|
+
name: string;
|
|
141
|
+
result: unknown;
|
|
142
|
+
};
|
|
143
|
+
/** Error information */
|
|
144
|
+
error?: {
|
|
145
|
+
code: string;
|
|
146
|
+
message: string;
|
|
147
|
+
};
|
|
148
|
+
/** Timestamp */
|
|
149
|
+
timestamp: Date;
|
|
150
|
+
};
|
|
151
|
+
/**
|
|
152
|
+
* Realtime event handler callbacks
|
|
153
|
+
*/
|
|
154
|
+
export type RealtimeEventHandlers = {
|
|
155
|
+
/** Called when audio is received */
|
|
156
|
+
onAudio?: (chunk: RealtimeAudioChunk) => void;
|
|
157
|
+
/** Called when text/transcript is received */
|
|
158
|
+
onTranscript?: (text: string, isFinal: boolean) => void;
|
|
159
|
+
/** Called when the model generates text */
|
|
160
|
+
onText?: (text: string, isFinal: boolean) => void;
|
|
161
|
+
/** Called when a function call is requested */
|
|
162
|
+
onFunctionCall?: (name: string, args: Record<string, unknown>) => Promise<unknown>;
|
|
163
|
+
/** Called when session state changes */
|
|
164
|
+
onStateChange?: (state: RealtimeSessionState) => void;
|
|
165
|
+
/** Called when an error occurs */
|
|
166
|
+
onError?: (error: Error) => void;
|
|
167
|
+
/** Called when a turn starts */
|
|
168
|
+
onTurnStart?: () => void;
|
|
169
|
+
/** Called when a turn ends */
|
|
170
|
+
onTurnEnd?: () => void;
|
|
171
|
+
};
|
|
172
|
+
/**
|
|
173
|
+
* Realtime voice provider type (bidirectional audio)
|
|
174
|
+
*/
|
|
175
|
+
export type RealtimeVoiceProvider = {
|
|
176
|
+
/** Provider name identifier */
|
|
177
|
+
readonly name: string;
|
|
178
|
+
/** Get supported capabilities */
|
|
179
|
+
getCapabilities(): RealtimeProviderCapability[];
|
|
180
|
+
/** Check if provider is properly configured */
|
|
181
|
+
isConfigured(): boolean;
|
|
182
|
+
/** Validate provider configuration */
|
|
183
|
+
validateConfig(): Promise<{
|
|
184
|
+
valid: boolean;
|
|
185
|
+
errors: string[];
|
|
186
|
+
}>;
|
|
187
|
+
/** Get provider-specific options schema */
|
|
188
|
+
getOptionsSchema?(): Record<string, unknown>;
|
|
189
|
+
/**
|
|
190
|
+
* Create a new realtime session
|
|
191
|
+
*/
|
|
192
|
+
connect(config: RealtimeConfig): Promise<RealtimeSession>;
|
|
193
|
+
/**
|
|
194
|
+
* Check if connected
|
|
195
|
+
*/
|
|
196
|
+
isConnected(): boolean;
|
|
197
|
+
/**
|
|
198
|
+
* Disconnect from realtime session
|
|
199
|
+
*/
|
|
200
|
+
disconnect(): Promise<void>;
|
|
201
|
+
/**
|
|
202
|
+
* Get current session configuration
|
|
203
|
+
*/
|
|
204
|
+
getSessionConfig(): RealtimeConfig | null;
|
|
205
|
+
};
|
|
206
|
+
export type RealtimeHandler = {
|
|
207
|
+
readonly name: string;
|
|
208
|
+
connect(config: RealtimeConfig): Promise<RealtimeSession>;
|
|
209
|
+
disconnect(): Promise<void>;
|
|
210
|
+
isConnected(): boolean;
|
|
211
|
+
getSession(): RealtimeSession | null;
|
|
212
|
+
sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
|
|
213
|
+
sendText?(text: string): Promise<void>;
|
|
214
|
+
triggerResponse?(): Promise<void>;
|
|
215
|
+
cancelResponse?(): Promise<void>;
|
|
216
|
+
on(handlers: RealtimeEventHandlers): void;
|
|
217
|
+
off(): void;
|
|
218
|
+
isConfigured(): boolean;
|
|
219
|
+
getSupportedFormats(): TTSAudioFormat[];
|
|
220
|
+
};
|
|
221
|
+
/**
|
|
222
|
+
* Realtime error codes
|
|
223
|
+
*/
|
|
224
|
+
export declare const REALTIME_ERROR_CODES: {
|
|
225
|
+
readonly CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED";
|
|
226
|
+
readonly SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT";
|
|
227
|
+
readonly PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR";
|
|
228
|
+
readonly AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR";
|
|
229
|
+
readonly PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED";
|
|
230
|
+
readonly PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED";
|
|
231
|
+
readonly SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE";
|
|
232
|
+
readonly SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE";
|
|
233
|
+
readonly INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE";
|
|
234
|
+
};
|
|
235
|
+
/**
|
|
236
|
+
* Default realtime configuration
|
|
237
|
+
*/
|
|
238
|
+
export declare const DEFAULT_REALTIME_CONFIG: Partial<RealtimeConfig>;
|
|
239
|
+
/**
|
|
240
|
+
* Type guard for valid RealtimeConfig
|
|
241
|
+
*/
|
|
242
|
+
export declare function isValidRealtimeConfig(config: unknown): config is RealtimeConfig;
|
|
243
|
+
export {};
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime Voice Type Definitions for NeuroLink
|
|
3
|
+
*
|
|
4
|
+
* All realtime/bidirectional voice types: session, config, messages,
|
|
5
|
+
* event handlers, provider types, handler types, error codes, defaults,
|
|
6
|
+
* and type guards.
|
|
7
|
+
*
|
|
8
|
+
* @module types/realtime
|
|
9
|
+
*/
|
|
10
|
+
// ============================================================================
|
|
11
|
+
// REALTIME ERROR CODES
|
|
12
|
+
// ============================================================================
|
|
13
|
+
/**
|
|
14
|
+
* Realtime error codes
|
|
15
|
+
*/
|
|
16
|
+
export const REALTIME_ERROR_CODES = {
|
|
17
|
+
CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED",
|
|
18
|
+
SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT",
|
|
19
|
+
PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR",
|
|
20
|
+
AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR",
|
|
21
|
+
PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED",
|
|
22
|
+
PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED",
|
|
23
|
+
SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE",
|
|
24
|
+
SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE",
|
|
25
|
+
INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE",
|
|
26
|
+
};
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// REALTIME DEFAULTS
|
|
29
|
+
// ============================================================================
|
|
30
|
+
/**
|
|
31
|
+
* Default realtime configuration
|
|
32
|
+
*/
|
|
33
|
+
export const DEFAULT_REALTIME_CONFIG = {
|
|
34
|
+
timeout: 30000,
|
|
35
|
+
inputSampleRate: 24000,
|
|
36
|
+
outputSampleRate: 24000,
|
|
37
|
+
vadEnabled: true,
|
|
38
|
+
vadThreshold: 0.5,
|
|
39
|
+
turnDetection: "server_vad",
|
|
40
|
+
};
|
|
41
|
+
// ============================================================================
|
|
42
|
+
// REALTIME TYPE GUARDS
|
|
43
|
+
// ============================================================================
|
|
44
|
+
/**
 * Type guard for valid RealtimeConfig.
 *
 * Accepts a value only when it is a non-null object whose `provider` is one
 * of the supported realtime providers and whose optional numeric fields are
 * usable:
 *  - `timeout`, when present, must be a number greater than 0;
 *  - `vadThreshold`, when present, must be a number within [0, 1].
 *
 * NaN is rejected for both numeric fields. `typeof NaN` is "number" and every
 * ordering comparison against NaN is false, so range checks written as
 * "reject when out of range" would let it through; the checks below are
 * phrased as "accept only when in range" instead.
 *
 * @param config - Candidate value to validate.
 * @returns `true` when `config` satisfies the checks above, otherwise `false`.
 */
export function isValidRealtimeConfig(config) {
    if (!config || typeof config !== "object") {
        return false;
    }
    const conf = config;
    // Provider must be one of the handler keys the realtime layer knows about.
    if (!conf.provider ||
        !["openai-realtime", "gemini-live"].includes(conf.provider)) {
        return false;
    }
    if (conf.timeout !== undefined) {
        // `!(x > 0)` rejects zero, negatives AND NaN (unlike `x <= 0`).
        if (typeof conf.timeout !== "number" || !(conf.timeout > 0)) {
            return false;
        }
    }
    if (conf.vadThreshold !== undefined) {
        // Accept only values inside the closed interval [0, 1]; NaN fails both
        // comparisons and is therefore rejected.
        if (typeof conf.vadThreshold !== "number" ||
            !(conf.vadThreshold >= 0 && conf.vadThreshold <= 1)) {
            return false;
        }
    }
    return true;
}
|
|
70
|
+
//# sourceMappingURL=realtime.js.map
|
|
@@ -1119,3 +1119,71 @@ export type CobraInstance = {
|
|
|
1119
1119
|
process: (pcm: Int16Array) => number;
|
|
1120
1120
|
release: () => void;
|
|
1121
1121
|
};
|
|
1122
|
+
/**
|
|
1123
|
+
* Per-WebSocket-connection context object passed to the voice connection
|
|
1124
|
+
* handler. Holds shared singletons that all per-connection state derives from.
|
|
1125
|
+
*
|
|
1126
|
+
* (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
|
|
1127
|
+
*/
|
|
1128
|
+
export type ServerVoiceConnectionCtx = {
|
|
1129
|
+
neurolink: NeuroLink;
|
|
1130
|
+
accessKey: string;
|
|
1131
|
+
};
|
|
1132
|
+
/**
|
|
1133
|
+
* Per-session mutable state for one voice WebSocket connection.
|
|
1134
|
+
*
|
|
1135
|
+
* Threaded through the voice connection helper functions so each connection
|
|
1136
|
+
* has fully isolated turn / TTS / VAD / barge-in state. The class types
|
|
1137
|
+
* (`FrameBus`, `TurnManager`, `CartesiaStream`) are imported as types here so
|
|
1138
|
+
* that this file remains the single source of truth — consumers import this
|
|
1139
|
+
* type via the barrel and do not redefine it locally.
|
|
1140
|
+
*
|
|
1141
|
+
* (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
|
|
1142
|
+
*/
|
|
1143
|
+
export type ServerVoiceSessionState = {
|
|
1144
|
+
cobra: CobraInstance | null;
|
|
1145
|
+
FRAME_LENGTH: number;
|
|
1146
|
+
FRAME_BYTES: number;
|
|
1147
|
+
bus: import("../server/voice/frameBus.js").FrameBus;
|
|
1148
|
+
turnManager: import("../server/voice/turnManager.js").TurnManager;
|
|
1149
|
+
sonioxWs: import("ws").WebSocket | null;
|
|
1150
|
+
keepAliveTimer: NodeJS.Timeout | null;
|
|
1151
|
+
sonioxReconnectTimer: ReturnType<typeof setTimeout> | null;
|
|
1152
|
+
sessionClosed: boolean;
|
|
1153
|
+
transcriptBuffer: string;
|
|
1154
|
+
activeTTS: import("../adapters/tts/cartesiaHandler.js").CartesiaStream | null;
|
|
1155
|
+
conversation: ConversationMessage[];
|
|
1156
|
+
currentTurnId: number;
|
|
1157
|
+
activePipelineTurnId: number | null;
|
|
1158
|
+
turnAborters: Set<{
|
|
1159
|
+
aborted: boolean;
|
|
1160
|
+
}>;
|
|
1161
|
+
playbackResetTimer: NodeJS.Timeout | null;
|
|
1162
|
+
bargeInLockedUntil: number;
|
|
1163
|
+
isSpeaking: boolean;
|
|
1164
|
+
silenceFrameCount: number;
|
|
1165
|
+
voiceFrameCount: number;
|
|
1166
|
+
frameRemainder: Buffer;
|
|
1167
|
+
};
|
|
1168
|
+
/**
|
|
1169
|
+
* Options accepted by `setupWebSocket()` in `server/voice/voiceWebSocketHandler.ts`.
|
|
1170
|
+
*
|
|
1171
|
+
* (Server-prefixed per CLAUDE.md Rule 9 — server-tier type. Lives in
|
|
1172
|
+
* `server.ts` rather than `cli.ts` because it configures a server-side
|
|
1173
|
+
* WebSocket upgrade handler, not CLI argument parsing.)
|
|
1174
|
+
*/
|
|
1175
|
+
export type ServerVoiceWebSocketOptions = {
|
|
1176
|
+
/**
|
|
1177
|
+
* Optional shared-secret bearer token. When set, the WebSocket upgrade
|
|
1178
|
+
* handshake must include `Authorization: Bearer <token>` or
|
|
1179
|
+
* `?token=<token>` in the URL. Without this, anyone reachable on the
|
|
1180
|
+
* network can open a session and consume Soniox / Cartesia / LLM credits.
|
|
1181
|
+
*/
|
|
1182
|
+
authToken?: string;
|
|
1183
|
+
/**
|
|
1184
|
+
* Maximum WebSocket message size in bytes. Defaults to 1 MiB. Caps both
|
|
1185
|
+
* inbound audio frames and any client control messages — guards against
|
|
1186
|
+
* OOM via oversized uploads.
|
|
1187
|
+
*/
|
|
1188
|
+
maxPayload?: number;
|
|
1189
|
+
};
|
package/dist/lib/types/span.d.ts
CHANGED
package/dist/lib/types/span.js
CHANGED
|
@@ -38,6 +38,8 @@ export var SpanType;
|
|
|
38
38
|
SpanType["WORKFLOW"] = "workflow";
|
|
39
39
|
/** TTS synthesis */
|
|
40
40
|
SpanType["TTS"] = "tts";
|
|
41
|
+
/** STT transcription */
|
|
42
|
+
SpanType["STT"] = "stt";
|
|
41
43
|
/** Server adapter request */
|
|
42
44
|
SpanType["SERVER_REQUEST"] = "server.request";
|
|
43
45
|
/** Custom span */
|
|
@@ -11,7 +11,8 @@ import type { ChatMessage } from "./conversation.js";
|
|
|
11
11
|
import type { StreamNoOutputSentinel } from "./noOutputSentinel.js";
|
|
12
12
|
import type { AdditionalMemoryUser } from "./generate.js";
|
|
13
13
|
import type { AIModelProviderConfig, NeurolinkCredentials } from "./providers.js";
|
|
14
|
-
import type { TTSChunk, TTSOptions } from "./tts.js";
|
|
14
|
+
import type { TTSChunk, TTSOptions, TTSResult } from "./tts.js";
|
|
15
|
+
import type { STTOptions, STTResult } from "./stt.js";
|
|
15
16
|
import type { StandardRecord, ValidationSchema } from "./aliases.js";
|
|
16
17
|
import type { FileWithMetadata } from "./file.js";
|
|
17
18
|
import type { WorkflowConfig } from "./workflow.js";
|
|
@@ -127,9 +128,9 @@ export type AudioChunk = {
|
|
|
127
128
|
* ```typescript
|
|
128
129
|
* const audioBuffer: Buffer[] = [];
|
|
129
130
|
* for await (const chunk of result.stream) {
|
|
130
|
-
* if (chunk.type === "
|
|
131
|
-
* audioBuffer.push(chunk.
|
|
132
|
-
* if (chunk.
|
|
131
|
+
* if (chunk.type === "tts_audio") {
|
|
132
|
+
* audioBuffer.push(chunk.audio.data); // TypeScript knows 'audio' exists
|
|
133
|
+
* if (chunk.audio.isFinal) {
|
|
133
134
|
* const fullAudio = Buffer.concat(audioBuffer);
|
|
134
135
|
* fs.writeFileSync('output.mp3', fullAudio);
|
|
135
136
|
* }
|
|
@@ -144,8 +145,8 @@ export type AudioChunk = {
|
|
|
144
145
|
* case "text":
|
|
145
146
|
* process.stdout.write(chunk.content);
|
|
146
147
|
* break;
|
|
147
|
-
* case "
|
|
148
|
-
* playAudioChunk(chunk.
|
|
148
|
+
* case "tts_audio":
|
|
149
|
+
* playAudioChunk(chunk.audio.data);
|
|
149
150
|
* break;
|
|
150
151
|
* }
|
|
151
152
|
* }
|
|
@@ -157,10 +158,12 @@ export type StreamChunk = {
|
|
|
157
158
|
/** Text content chunk */
|
|
158
159
|
content: string;
|
|
159
160
|
} | {
|
|
160
|
-
/** Discriminator for audio chunks
|
|
161
|
-
|
|
161
|
+
/** Discriminator for synthesized TTS audio chunks. Uses `tts_audio`
|
|
162
|
+
* (not `audio`) to avoid colliding with realtime AudioChunk and to
|
|
163
|
+
* match the runtime shape emitted by `BaseProvider.stream()`. */
|
|
164
|
+
type: "tts_audio";
|
|
162
165
|
/** TTS audio chunk data */
|
|
163
|
-
|
|
166
|
+
audio: TTSChunk;
|
|
164
167
|
};
|
|
165
168
|
export type StreamOptions = {
|
|
166
169
|
input: {
|
|
@@ -226,9 +229,9 @@ export type StreamOptions = {
|
|
|
226
229
|
* for await (const chunk of result.stream) {
|
|
227
230
|
* if (chunk.type === "text") {
|
|
228
231
|
* process.stdout.write(chunk.content);
|
|
229
|
-
* } else if (chunk.type === "
|
|
232
|
+
* } else if (chunk.type === "tts_audio") {
|
|
230
233
|
* // Handle audio chunk
|
|
231
|
-
* playAudioChunk(chunk.
|
|
234
|
+
* playAudioChunk(chunk.audio.data);
|
|
232
235
|
* }
|
|
233
236
|
* }
|
|
234
237
|
* ```
|
|
@@ -249,6 +252,15 @@ export type StreamOptions = {
|
|
|
249
252
|
* ```
|
|
250
253
|
*/
|
|
251
254
|
tts?: TTSOptions;
|
|
255
|
+
/**
|
|
256
|
+
* Speech-to-Text (STT) configuration for streaming
|
|
257
|
+
*
|
|
258
|
+
* When enabled, audio from `stt.audio` is transcribed before streaming begins.
|
|
259
|
+
*/
|
|
260
|
+
stt?: STTOptions & {
|
|
261
|
+
provider?: string;
|
|
262
|
+
audio?: Buffer | ArrayBuffer;
|
|
263
|
+
};
|
|
252
264
|
/**
|
|
253
265
|
* Thinking/reasoning configuration for extended thinking models
|
|
254
266
|
*
|
|
@@ -491,6 +503,9 @@ export type StreamResult = {
|
|
|
491
503
|
} | StreamNoOutputSentinel | {
|
|
492
504
|
type: "audio";
|
|
493
505
|
audio: AudioChunk;
|
|
506
|
+
} | {
|
|
507
|
+
type: "tts_audio";
|
|
508
|
+
audio: TTSChunk;
|
|
494
509
|
} | {
|
|
495
510
|
type: "image";
|
|
496
511
|
imageOutput: {
|
|
@@ -499,9 +514,6 @@ export type StreamResult = {
|
|
|
499
514
|
} | {
|
|
500
515
|
content: string;
|
|
501
516
|
type?: "preliminary" | "final";
|
|
502
|
-
} | {
|
|
503
|
-
type: "audio";
|
|
504
|
-
audio: AudioChunk;
|
|
505
517
|
}>;
|
|
506
518
|
provider?: string;
|
|
507
519
|
model?: string;
|
|
@@ -566,6 +578,16 @@ export type StreamResult = {
|
|
|
566
578
|
workflowId: string;
|
|
567
579
|
workflowName: string;
|
|
568
580
|
};
|
|
581
|
+
/** STT transcription result (when stt option is used) */
|
|
582
|
+
transcription?: STTResult;
|
|
583
|
+
/**
|
|
584
|
+
* TTS Mode 2 result (when `tts.enabled && tts.useAiResponse`).
|
|
585
|
+
* Resolves with the synthesized audio after the stream completes;
|
|
586
|
+
* resolves to undefined if TTS was not enabled or synthesis failed.
|
|
587
|
+
* The same audio is also yielded as a final chunk on `stream` for callers
|
|
588
|
+
* that prefer to consume it inline.
|
|
589
|
+
*/
|
|
590
|
+
audio?: Promise<TTSResult | undefined>;
|
|
569
591
|
};
|
|
570
592
|
/**
|
|
571
593
|
* Enhanced provider type with stream method
|