@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -10,6 +10,7 @@ import type { MiddlewareFactoryOptions, OnFinishCallback, OnErrorCallback } from
10
10
  import type { DirectorModeOptions, DirectorSegment, VideoGenerationResult, VideoOutputOptions } from "./multimodal.js";
11
11
  import type { PPTGenerationResult, PPTOutputOptions } from "./ppt.js";
12
12
  import type { TTSOptions, TTSResult } from "./tts.js";
13
+ import type { STTOptions, STTResult } from "./stt.js";
13
14
  import type { StandardRecord, ValidationSchema, ZodUnknownSchema } from "./aliases.js";
14
15
  import type { NeurolinkCredentials } from "./providers.js";
15
16
  import type { FileWithMetadata } from "./file.js";
@@ -142,6 +143,27 @@ export type GenerateOptions = {
142
143
  * ```
143
144
  */
144
145
  tts?: TTSOptions;
146
+ /**
147
+ * Speech-to-Text (STT) configuration
148
+ *
149
+ * Enable audio transcription. When enabled, the audio provided via `stt.audio`
150
+ * will be transcribed to text and used as the prompt.
151
+ *
152
+ * @example
153
+ * ```typescript
154
+ * const neurolink = new NeuroLink();
155
+ * const result = await neurolink.generate({
156
+ * input: { text: "" },
157
+ * provider: "openai",
158
+ * stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
159
+ * });
160
+ * // The audio is transcribed first; result.transcription holds the STT result
161
+ * ```
162
+ */
163
+ stt?: STTOptions & {
164
+ provider?: string;
165
+ audio?: Buffer | ArrayBuffer;
166
+ };
145
167
  /**
146
168
  * Thinking/reasoning configuration for extended thinking models
147
169
  *
@@ -660,6 +682,8 @@ export type GenerateResult = {
660
682
  reasoning?: string;
661
683
  /** Token count for reasoning content */
662
684
  reasoningTokens?: number;
685
+ /** STT transcription result (present when stt.enabled is true and audio input was provided) */
686
+ transcription?: STTResult;
663
687
  retries?: {
664
688
  count: number;
665
689
  errors: Array<{
@@ -868,6 +892,27 @@ export type TextGenerationOptions = {
868
892
  * ```
869
893
  */
870
894
  tts?: TTSOptions;
895
+ /**
896
+ * Speech-to-Text (STT) configuration
897
+ *
898
+ * Enable audio transcription. When enabled, the audio provided via `stt.audio`
899
+ * will be transcribed to text and used as the prompt.
900
+ *
901
+ * @example
902
+ * ```typescript
903
+ * const neurolink = new NeuroLink();
904
+ * const result = await neurolink.generate({
905
+ * input: { text: "" },
906
+ * provider: "openai",
907
+ * stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
908
+ * });
909
+ * // The audio is transcribed first; result.transcription holds the STT result
910
+ * ```
911
+ */
912
+ stt?: STTOptions & {
913
+ provider?: string;
914
+ audio?: Buffer | ArrayBuffer;
915
+ };
871
916
  enableEvaluation?: boolean;
872
917
  enableAnalytics?: boolean;
873
918
  context?: Record<string, JsonValue>;
@@ -1033,6 +1078,8 @@ export type TextGenerationResult = {
1033
1078
  analytics?: AnalyticsData;
1034
1079
  evaluation?: EvaluationData;
1035
1080
  audio?: TTSResult;
1081
+ /** STT transcription result (present when stt input was processed) */
1082
+ transcription?: STTResult;
1036
1083
  /** Video generation result */
1037
1084
  video?: VideoGenerationResult;
1038
1085
  /** PowerPoint generation result */
@@ -232,3 +232,6 @@ export type HITLManager = {
232
232
  on(event: string, listener: (...args: unknown[]) => void): HITLManager;
233
233
  emit(event: string, ...args: unknown[]): boolean;
234
234
  };
235
+ export type HITLExecutionState = {
236
+ triggered: boolean;
237
+ };
@@ -48,7 +48,7 @@ export * from "./subscription.js";
48
48
  export * from "./task.js";
49
49
  export * from "./taskClassification.js";
50
50
  export * from "./tools.js";
51
- export * from "./tts.js";
51
+ export * from "./voice.js";
52
52
  export * from "./universalProviderOptions.js";
53
53
  export * from "./utilities.js";
54
54
  export * from "./workflow.js";
@@ -49,7 +49,7 @@ export * from "./subscription.js";
49
49
  export * from "./task.js";
50
50
  export * from "./taskClassification.js";
51
51
  export * from "./tools.js";
52
- export * from "./tts.js";
52
+ export * from "./voice.js";
53
53
  export * from "./universalProviderOptions.js";
54
54
  export * from "./utilities.js";
55
55
  export * from "./workflow.js";
@@ -0,0 +1,243 @@
1
+ /**
2
+ * Realtime Voice Type Definitions for NeuroLink
3
+ *
4
+ * All realtime/bidirectional voice types: session, config, messages,
5
+ * event handlers, provider types, handler types, error codes, defaults,
6
+ * and type guards.
7
+ *
8
+ * @module types/realtime
9
+ */
10
+ import type { TTSAudioFormat } from "./tts.js";
11
+ type RealtimeProviderCapability = "tts" | "stt" | "realtime" | "streaming";
12
+ /**
13
+ * Realtime session state
14
+ */
15
+ export type RealtimeSessionState = "disconnected" | "connecting" | "connected" | "disconnecting" | "error";
16
+ /**
17
+ * Realtime voice configuration
18
+ */
19
+ export type RealtimeConfig = {
20
+ /**
21
+ * Provider to use. Must match a handler key registered with
22
+ * `RealtimeProcessor.registerHandler()` — currently `"openai-realtime"`
23
+ * and `"gemini-live"` (both registered in `providerRegistry.ts`).
24
+ * Aliases are resolved at registry/CLI parse time, not here, so only
25
+ * these exact keys are accepted.
26
+ */
27
+ provider: "openai-realtime" | "gemini-live";
28
+ /** API key */
29
+ apiKey?: string;
30
+ /** Model to use */
31
+ model?: string;
32
+ /** Voice for TTS output */
33
+ voice?: string;
34
+ /** Input language */
35
+ inputLanguage?: string;
36
+ /** Output language */
37
+ outputLanguage?: string;
38
+ /** System prompt for the AI */
39
+ systemPrompt?: string;
40
+ /** Session timeout in milliseconds */
41
+ timeout?: number;
42
+ /** Audio input format */
43
+ inputFormat?: TTSAudioFormat;
44
+ /** Audio output format */
45
+ outputFormat?: TTSAudioFormat;
46
+ /** Input sample rate */
47
+ inputSampleRate?: number;
48
+ /** Output sample rate */
49
+ outputSampleRate?: number;
50
+ /** Enable voice activity detection */
51
+ vadEnabled?: boolean;
52
+ /** VAD threshold (0-1) */
53
+ vadThreshold?: number;
54
+ /** Turn detection mode */
55
+ turnDetection?: "server_vad" | "manual";
56
+ /** Instructions/system prompt for the session */
57
+ instructions?: string;
58
+ /** Temperature for AI responses */
59
+ temperature?: number;
60
+ /** Tools/functions available to the model */
61
+ tools?: RealtimeTool[];
62
+ };
63
+ /**
64
+ * Realtime tool definition
65
+ */
66
+ export type RealtimeTool = {
67
+ /** Tool name */
68
+ name: string;
69
+ /** Tool description */
70
+ description: string;
71
+ /** JSON schema for parameters */
72
+ parameters: Record<string, unknown>;
73
+ };
74
+ /**
75
+ * Realtime session information
76
+ */
77
+ export type RealtimeSession = {
78
+ /** Session ID */
79
+ id: string;
80
+ /** Current state */
81
+ state: RealtimeSessionState;
82
+ /** Provider name — narrowed to the validated config provider union so
83
+ * session state stays aligned with what `connect()` accepts. */
84
+ provider: RealtimeConfig["provider"];
85
+ /** Model being used */
86
+ model?: string;
87
+ /** Session creation time */
88
+ createdAt: Date;
89
+ /** Last activity time */
90
+ lastActivityAt: Date;
91
+ /** Session configuration */
92
+ config: RealtimeConfig;
93
+ /** Check if session is open */
94
+ isOpen?: () => boolean;
95
+ /** Close the session */
96
+ close?: () => Promise<void>;
97
+ };
98
+ /**
99
+ * Realtime audio chunk
100
+ */
101
+ export type RealtimeAudioChunk = {
102
+ /** Audio data */
103
+ data: Buffer;
104
+ /** Chunk sequence number */
105
+ index: number;
106
+ /** Whether this is the final chunk */
107
+ isFinal: boolean;
108
+ /** Audio format */
109
+ format: TTSAudioFormat;
110
+ /** Sample rate */
111
+ sampleRate?: number;
112
+ /** Duration of this chunk in milliseconds */
113
+ durationMs?: number;
114
+ };
115
+ /**
116
+ * Realtime message types
117
+ */
118
+ export type RealtimeMessageType = "audio" | "text" | "transcript" | "function_call" | "function_result" | "error" | "session_update" | "turn_start" | "turn_end";
119
+ /**
120
+ * Realtime message
121
+ */
122
+ export type RealtimeMessage = {
123
+ /** Message type */
124
+ type: RealtimeMessageType;
125
+ /** Message ID */
126
+ id?: string;
127
+ /** Audio data (for audio messages) */
128
+ audio?: RealtimeAudioChunk;
129
+ /** Text content (for text/transcript messages) */
130
+ text?: string;
131
+ /** Whether this is a partial result */
132
+ isPartial?: boolean;
133
+ /** Function call data */
134
+ functionCall?: {
135
+ name: string;
136
+ arguments: Record<string, unknown>;
137
+ };
138
+ /** Function result data */
139
+ functionResult?: {
140
+ name: string;
141
+ result: unknown;
142
+ };
143
+ /** Error information */
144
+ error?: {
145
+ code: string;
146
+ message: string;
147
+ };
148
+ /** Timestamp */
149
+ timestamp: Date;
150
+ };
151
+ /**
152
+ * Realtime event handler callbacks
153
+ */
154
+ export type RealtimeEventHandlers = {
155
+ /** Called when audio is received */
156
+ onAudio?: (chunk: RealtimeAudioChunk) => void;
157
+ /** Called when text/transcript is received */
158
+ onTranscript?: (text: string, isFinal: boolean) => void;
159
+ /** Called when the model generates text */
160
+ onText?: (text: string, isFinal: boolean) => void;
161
+ /** Called when a function call is requested */
162
+ onFunctionCall?: (name: string, args: Record<string, unknown>) => Promise<unknown>;
163
+ /** Called when session state changes */
164
+ onStateChange?: (state: RealtimeSessionState) => void;
165
+ /** Called when an error occurs */
166
+ onError?: (error: Error) => void;
167
+ /** Called when a turn starts */
168
+ onTurnStart?: () => void;
169
+ /** Called when a turn ends */
170
+ onTurnEnd?: () => void;
171
+ };
172
+ /**
173
+ * Realtime voice provider type (bidirectional audio)
174
+ */
175
+ export type RealtimeVoiceProvider = {
176
+ /** Provider name identifier */
177
+ readonly name: string;
178
+ /** Get supported capabilities */
179
+ getCapabilities(): RealtimeProviderCapability[];
180
+ /** Check if provider is properly configured */
181
+ isConfigured(): boolean;
182
+ /** Validate provider configuration */
183
+ validateConfig(): Promise<{
184
+ valid: boolean;
185
+ errors: string[];
186
+ }>;
187
+ /** Get provider-specific options schema */
188
+ getOptionsSchema?(): Record<string, unknown>;
189
+ /**
190
+ * Create a new realtime session
191
+ */
192
+ connect(config: RealtimeConfig): Promise<RealtimeSession>;
193
+ /**
194
+ * Check if connected
195
+ */
196
+ isConnected(): boolean;
197
+ /**
198
+ * Disconnect from realtime session
199
+ */
200
+ disconnect(): Promise<void>;
201
+ /**
202
+ * Get current session configuration
203
+ */
204
+ getSessionConfig(): RealtimeConfig | null;
205
+ };
206
+ export type RealtimeHandler = {
207
+ readonly name: string;
208
+ connect(config: RealtimeConfig): Promise<RealtimeSession>;
209
+ disconnect(): Promise<void>;
210
+ isConnected(): boolean;
211
+ getSession(): RealtimeSession | null;
212
+ sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
213
+ sendText?(text: string): Promise<void>;
214
+ triggerResponse?(): Promise<void>;
215
+ cancelResponse?(): Promise<void>;
216
+ on(handlers: RealtimeEventHandlers): void;
217
+ off(): void;
218
+ isConfigured(): boolean;
219
+ getSupportedFormats(): TTSAudioFormat[];
220
+ };
221
+ /**
222
+ * Realtime error codes
223
+ */
224
+ export declare const REALTIME_ERROR_CODES: {
225
+ readonly CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED";
226
+ readonly SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT";
227
+ readonly PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR";
228
+ readonly AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR";
229
+ readonly PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED";
230
+ readonly PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED";
231
+ readonly SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE";
232
+ readonly SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE";
233
+ readonly INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE";
234
+ };
235
+ /**
236
+ * Default realtime configuration
237
+ */
238
+ export declare const DEFAULT_REALTIME_CONFIG: Partial<RealtimeConfig>;
239
+ /**
240
+ * Type guard for valid RealtimeConfig
241
+ */
242
+ export declare function isValidRealtimeConfig(config: unknown): config is RealtimeConfig;
243
+ export {};
@@ -0,0 +1,70 @@
1
+ /**
2
+ * Realtime Voice Type Definitions for NeuroLink
3
+ *
4
+ * All realtime/bidirectional voice types: session, config, messages,
5
+ * event handlers, provider types, handler types, error codes, defaults,
6
+ * and type guards.
7
+ *
8
+ * @module types/realtime
9
+ */
10
+ // ============================================================================
11
+ // REALTIME ERROR CODES
12
+ // ============================================================================
13
+ /**
14
+ * Realtime error codes: string identifiers, each prefixed with `REALTIME_`, keyed by a short name for the failure.
15
+ */
16
+ export const REALTIME_ERROR_CODES = {
17
+ CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED",
18
+ SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT",
19
+ PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR",
20
+ AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR",
21
+ PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED",
22
+ PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED",
23
+ SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE",
24
+ SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE",
25
+ INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE",
26
+ };
27
+ // ============================================================================
28
+ // REALTIME DEFAULTS
29
+ // ============================================================================
30
+ /**
31
+ * Default realtime configuration. Supplies defaults only for the timeout, sample rates, VAD and turn detection; `provider` and every other RealtimeConfig field have no default here.
32
+ */
33
+ export const DEFAULT_REALTIME_CONFIG = {
34
+ timeout: 30000, // session timeout in milliseconds
35
+ inputSampleRate: 24000, // NOTE(review): presumably Hz — confirm against the providers
36
+ outputSampleRate: 24000, // NOTE(review): presumably Hz — confirm against the providers
37
+ vadEnabled: true,
38
+ vadThreshold: 0.5, // inside the 0-1 range that isValidRealtimeConfig accepts
39
+ turnDetection: "server_vad", // the other declared mode is "manual"
40
+ };
41
+ // ============================================================================
42
+ // REALTIME TYPE GUARDS
43
+ // ============================================================================
44
/**
 * Type guard for valid RealtimeConfig.
 *
 * Accepts a value only when it is a non-null object whose `provider` is one of
 * the supported handler keys and whose optional numeric fields are sane:
 *
 * - `timeout`, when present, must be a finite number greater than 0.
 * - `vadThreshold`, when present, must be a finite number in the range 0-1
 *   (inclusive).
 *
 * All other fields are left unchecked.
 *
 * @param config - Candidate value, typically untrusted caller input.
 * @returns `true` when `config` passes every check above, `false` otherwise.
 */
export function isValidRealtimeConfig(config) {
    if (!config || typeof config !== "object") {
        return false;
    }
    const conf = config;
    if (!conf.provider ||
        !["openai-realtime", "gemini-live"].includes(conf.provider)) {
        return false;
    }
    if (conf.timeout !== undefined) {
        // Number.isFinite does no coercion, so it rejects non-numbers as well as
        // NaN and ±Infinity. A plain `typeof === "number"` check lets NaN through
        // because every comparison against NaN evaluates to false.
        if (!Number.isFinite(conf.timeout) || conf.timeout <= 0) {
            return false;
        }
    }
    if (conf.vadThreshold !== undefined) {
        // Same reasoning: NaN would otherwise slip past both range comparisons.
        if (!Number.isFinite(conf.vadThreshold) ||
            conf.vadThreshold < 0 ||
            conf.vadThreshold > 1) {
            return false;
        }
    }
    return true;
}
70
+ //# sourceMappingURL=realtime.js.map
@@ -1119,3 +1119,71 @@ export type CobraInstance = {
1119
1119
  process: (pcm: Int16Array) => number;
1120
1120
  release: () => void;
1121
1121
  };
1122
+ /**
1123
+ * Per-WebSocket-connection context object passed to the voice connection
1124
+ * handler. Holds shared singletons that all per-connection state derives from.
1125
+ *
1126
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
1127
+ */
1128
+ export type ServerVoiceConnectionCtx = {
1129
+ neurolink: NeuroLink;
1130
+ accessKey: string;
1131
+ };
1132
+ /**
1133
+ * Per-session mutable state for one voice WebSocket connection.
1134
+ *
1135
+ * Threaded through the voice connection helper functions so each connection
1136
+ * has fully isolated turn / TTS / VAD / barge-in state. The class types
1137
+ * (`FrameBus`, `TurnManager`, `CartesiaStream`) are imported as types here so
1138
+ * that this file remains the single source of truth — consumers import this
1139
+ * type via the barrel and do not redefine it locally.
1140
+ *
1141
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
1142
+ */
1143
+ export type ServerVoiceSessionState = {
1144
+ cobra: CobraInstance | null;
1145
+ FRAME_LENGTH: number;
1146
+ FRAME_BYTES: number;
1147
+ bus: import("../server/voice/frameBus.js").FrameBus;
1148
+ turnManager: import("../server/voice/turnManager.js").TurnManager;
1149
+ sonioxWs: import("ws").WebSocket | null;
1150
+ keepAliveTimer: NodeJS.Timeout | null;
1151
+ sonioxReconnectTimer: ReturnType<typeof setTimeout> | null;
1152
+ sessionClosed: boolean;
1153
+ transcriptBuffer: string;
1154
+ activeTTS: import("../adapters/tts/cartesiaHandler.js").CartesiaStream | null;
1155
+ conversation: ConversationMessage[];
1156
+ currentTurnId: number;
1157
+ activePipelineTurnId: number | null;
1158
+ turnAborters: Set<{
1159
+ aborted: boolean;
1160
+ }>;
1161
+ playbackResetTimer: NodeJS.Timeout | null;
1162
+ bargeInLockedUntil: number;
1163
+ isSpeaking: boolean;
1164
+ silenceFrameCount: number;
1165
+ voiceFrameCount: number;
1166
+ frameRemainder: Buffer;
1167
+ };
1168
+ /**
1169
+ * Options accepted by `setupWebSocket()` in `server/voice/voiceWebSocketHandler.ts`.
1170
+ *
1171
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type. Lives in
1172
+ * `server.ts` rather than `cli.ts` because it configures a server-side
1173
+ * WebSocket upgrade handler, not CLI argument parsing.)
1174
+ */
1175
+ export type ServerVoiceWebSocketOptions = {
1176
+ /**
1177
+ * Optional shared-secret bearer token. When set, the WebSocket upgrade
1178
+ * handshake must include `Authorization: Bearer <token>` or
1179
+ * `?token=<token>` in the URL. When unset, anyone who can reach the server
1180
+ * over the network can open a session and consume Soniox / Cartesia / LLM credits.
1181
+ */
1182
+ authToken?: string;
1183
+ /**
1184
+ * Maximum WebSocket message size in bytes. Defaults to 1 MiB. Caps both
1185
+ * inbound audio frames and any client control messages — guards against
1186
+ * OOM via oversized uploads.
1187
+ */
1188
+ maxPayload?: number;
1189
+ };
@@ -37,6 +37,8 @@ export declare enum SpanType {
37
37
  WORKFLOW = "workflow",
38
38
  /** TTS synthesis */
39
39
  TTS = "tts",
40
+ /** STT transcription */
41
+ STT = "stt",
40
42
  /** Server adapter request */
41
43
  SERVER_REQUEST = "server.request",
42
44
  /** Custom span */
@@ -38,6 +38,8 @@ export var SpanType;
38
38
  SpanType["WORKFLOW"] = "workflow";
39
39
  /** TTS synthesis */
40
40
  SpanType["TTS"] = "tts";
41
+ /** STT transcription */
42
+ SpanType["STT"] = "stt";
41
43
  /** Server adapter request */
42
44
  SpanType["SERVER_REQUEST"] = "server.request";
43
45
  /** Custom span */
@@ -11,7 +11,8 @@ import type { ChatMessage } from "./conversation.js";
11
11
  import type { StreamNoOutputSentinel } from "./noOutputSentinel.js";
12
12
  import type { AdditionalMemoryUser } from "./generate.js";
13
13
  import type { AIModelProviderConfig, NeurolinkCredentials } from "./providers.js";
14
- import type { TTSChunk, TTSOptions } from "./tts.js";
14
+ import type { TTSChunk, TTSOptions, TTSResult } from "./tts.js";
15
+ import type { STTOptions, STTResult } from "./stt.js";
15
16
  import type { StandardRecord, ValidationSchema } from "./aliases.js";
16
17
  import type { FileWithMetadata } from "./file.js";
17
18
  import type { WorkflowConfig } from "./workflow.js";
@@ -127,9 +128,9 @@ export type AudioChunk = {
127
128
  * ```typescript
128
129
  * const audioBuffer: Buffer[] = [];
129
130
  * for await (const chunk of result.stream) {
130
- * if (chunk.type === "audio") {
131
- * audioBuffer.push(chunk.audioChunk.data); // TypeScript knows 'audioChunk' exists
132
- * if (chunk.audioChunk.isFinal) {
131
+ * if (chunk.type === "tts_audio") {
132
+ * audioBuffer.push(chunk.audio.data); // TypeScript knows 'audio' exists
133
+ * if (chunk.audio.isFinal) {
133
134
  * const fullAudio = Buffer.concat(audioBuffer);
134
135
  * fs.writeFileSync('output.mp3', fullAudio);
135
136
  * }
@@ -144,8 +145,8 @@ export type AudioChunk = {
144
145
  * case "text":
145
146
  * process.stdout.write(chunk.content);
146
147
  * break;
147
- * case "audio":
148
- * playAudioChunk(chunk.audioChunk.data);
148
+ * case "tts_audio":
149
+ * playAudioChunk(chunk.audio.data);
149
150
  * break;
150
151
  * }
151
152
  * }
@@ -157,10 +158,12 @@ export type StreamChunk = {
157
158
  /** Text content chunk */
158
159
  content: string;
159
160
  } | {
160
- /** Discriminator for audio chunks */
161
- type: "audio";
161
+ /** Discriminator for synthesized TTS audio chunks. Uses `tts_audio`
162
+ * (not `audio`) to avoid colliding with realtime AudioChunk and to
163
+ * match the runtime shape emitted by `BaseProvider.stream()`. */
164
+ type: "tts_audio";
162
165
  /** TTS audio chunk data */
163
- audioChunk: TTSChunk;
166
+ audio: TTSChunk;
164
167
  };
165
168
  export type StreamOptions = {
166
169
  input: {
@@ -226,9 +229,9 @@ export type StreamOptions = {
226
229
  * for await (const chunk of result.stream) {
227
230
  * if (chunk.type === "text") {
228
231
  * process.stdout.write(chunk.content);
229
- * } else if (chunk.type === "audio") {
232
+ * } else if (chunk.type === "tts_audio") {
230
233
  * // Handle audio chunk
231
- * playAudioChunk(chunk.audioChunk.data);
234
+ * playAudioChunk(chunk.audio.data);
232
235
  * }
233
236
  * }
234
237
  * ```
@@ -249,6 +252,15 @@ export type StreamOptions = {
249
252
  * ```
250
253
  */
251
254
  tts?: TTSOptions;
255
+ /**
256
+ * Speech-to-Text (STT) configuration for streaming
257
+ *
258
+ * When enabled, audio from `stt.audio` is transcribed before streaming begins.
259
+ */
260
+ stt?: STTOptions & {
261
+ provider?: string;
262
+ audio?: Buffer | ArrayBuffer;
263
+ };
252
264
  /**
253
265
  * Thinking/reasoning configuration for extended thinking models
254
266
  *
@@ -491,6 +503,9 @@ export type StreamResult = {
491
503
  } | StreamNoOutputSentinel | {
492
504
  type: "audio";
493
505
  audio: AudioChunk;
506
+ } | {
507
+ type: "tts_audio";
508
+ audio: TTSChunk;
494
509
  } | {
495
510
  type: "image";
496
511
  imageOutput: {
@@ -499,9 +514,6 @@ export type StreamResult = {
499
514
  } | {
500
515
  content: string;
501
516
  type?: "preliminary" | "final";
502
- } | {
503
- type: "audio";
504
- audio: AudioChunk;
505
517
  }>;
506
518
  provider?: string;
507
519
  model?: string;
@@ -566,6 +578,16 @@ export type StreamResult = {
566
578
  workflowId: string;
567
579
  workflowName: string;
568
580
  };
581
+ /** STT transcription result (when stt option is used) */
582
+ transcription?: STTResult;
583
+ /**
584
+ * TTS Mode 2 result (when `tts.enabled && tts.useAiResponse`).
585
+ * Resolves with the synthesized audio after the stream completes;
586
+ * resolves to undefined if TTS was not enabled or synthesis failed.
587
+ * The same audio is also yielded as a final chunk on `stream` for callers
588
+ * that prefer to consume it inline.
589
+ */
590
+ audio?: Promise<TTSResult | undefined>;
569
591
  };
570
592
  /**
571
593
  * Enhanced provider type with stream method