@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +23 -17
  3. package/dist/adapters/tts/googleTTSHandler.js +1 -1
  4. package/dist/browser/neurolink.min.js +382 -364
  5. package/dist/cli/commands/serve.js +9 -0
  6. package/dist/cli/commands/voiceServer.d.ts +7 -0
  7. package/dist/cli/commands/voiceServer.js +9 -1
  8. package/dist/cli/factories/commandFactory.js +136 -11
  9. package/dist/cli/loop/optionsSchema.d.ts +1 -1
  10. package/dist/cli/utils/audioFileUtils.d.ts +3 -3
  11. package/dist/cli/utils/audioFileUtils.js +5 -1
  12. package/dist/core/baseProvider.js +29 -6
  13. package/dist/factories/providerRegistry.d.ts +14 -0
  14. package/dist/factories/providerRegistry.js +141 -2
  15. package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
  16. package/dist/lib/core/baseProvider.js +29 -6
  17. package/dist/lib/factories/providerRegistry.d.ts +14 -0
  18. package/dist/lib/factories/providerRegistry.js +141 -2
  19. package/dist/lib/mcp/toolRegistry.js +7 -1
  20. package/dist/lib/neurolink.d.ts +19 -0
  21. package/dist/lib/neurolink.js +252 -14
  22. package/dist/lib/observability/exporters/laminarExporter.js +1 -0
  23. package/dist/lib/observability/exporters/posthogExporter.js +1 -0
  24. package/dist/lib/observability/utils/spanSerializer.js +1 -0
  25. package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
  26. package/dist/lib/server/voice/tokenCompare.js +23 -0
  27. package/dist/lib/server/voice/voiceServerApp.js +62 -3
  28. package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
  29. package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
  30. package/dist/lib/types/generate.d.ts +47 -0
  31. package/dist/lib/types/hitl.d.ts +3 -0
  32. package/dist/lib/types/index.d.ts +1 -1
  33. package/dist/lib/types/index.js +1 -1
  34. package/dist/lib/types/realtime.d.ts +243 -0
  35. package/dist/lib/types/realtime.js +70 -0
  36. package/dist/lib/types/server.d.ts +68 -0
  37. package/dist/lib/types/span.d.ts +2 -0
  38. package/dist/lib/types/span.js +2 -0
  39. package/dist/lib/types/stream.d.ts +36 -14
  40. package/dist/lib/types/stt.d.ts +585 -0
  41. package/dist/lib/types/stt.js +90 -0
  42. package/dist/lib/types/tools.d.ts +2 -0
  43. package/dist/lib/types/tts.d.ts +23 -11
  44. package/dist/lib/types/tts.js +7 -0
  45. package/dist/lib/types/voice.d.ts +272 -0
  46. package/dist/lib/types/voice.js +137 -0
  47. package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
  48. package/dist/lib/utils/audioFormatDetector.js +34 -0
  49. package/dist/lib/utils/errorHandling.js +4 -0
  50. package/dist/lib/utils/sttProcessor.d.ts +115 -0
  51. package/dist/lib/utils/sttProcessor.js +295 -0
  52. package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
  53. package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
  54. package/dist/lib/voice/audio-utils.d.ts +135 -0
  55. package/dist/lib/voice/audio-utils.js +435 -0
  56. package/dist/lib/voice/errors.d.ts +123 -0
  57. package/dist/lib/voice/errors.js +386 -0
  58. package/dist/lib/voice/index.d.ts +26 -0
  59. package/dist/lib/voice/index.js +55 -0
  60. package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
  61. package/dist/lib/voice/providers/AzureSTT.js +345 -0
  62. package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
  63. package/dist/lib/voice/providers/AzureTTS.js +349 -0
  64. package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
  65. package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
  66. package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
  67. package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
  68. package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
  69. package/dist/lib/voice/providers/GeminiLive.js +372 -0
  70. package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
  71. package/dist/lib/voice/providers/GoogleSTT.js +454 -0
  72. package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
  73. package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
  74. package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
  75. package/dist/lib/voice/providers/OpenAISTT.js +286 -0
  76. package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
  77. package/dist/lib/voice/providers/OpenAITTS.js +271 -0
  78. package/dist/lib/voice/stream-handler.d.ts +166 -0
  79. package/dist/lib/voice/stream-handler.js +514 -0
  80. package/dist/mcp/toolRegistry.js +7 -1
  81. package/dist/neurolink.d.ts +19 -0
  82. package/dist/neurolink.js +252 -14
  83. package/dist/observability/exporters/laminarExporter.js +1 -0
  84. package/dist/observability/exporters/posthogExporter.js +1 -0
  85. package/dist/observability/utils/spanSerializer.js +1 -0
  86. package/dist/server/voice/tokenCompare.d.ts +14 -0
  87. package/dist/server/voice/tokenCompare.js +22 -0
  88. package/dist/server/voice/voiceServerApp.js +62 -3
  89. package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
  90. package/dist/server/voice/voiceWebSocketHandler.js +555 -435
  91. package/dist/types/generate.d.ts +47 -0
  92. package/dist/types/hitl.d.ts +3 -0
  93. package/dist/types/index.d.ts +1 -1
  94. package/dist/types/index.js +1 -1
  95. package/dist/types/realtime.d.ts +243 -0
  96. package/dist/types/realtime.js +69 -0
  97. package/dist/types/server.d.ts +68 -0
  98. package/dist/types/span.d.ts +2 -0
  99. package/dist/types/span.js +2 -0
  100. package/dist/types/stream.d.ts +36 -14
  101. package/dist/types/stt.d.ts +585 -0
  102. package/dist/types/stt.js +89 -0
  103. package/dist/types/tools.d.ts +2 -0
  104. package/dist/types/tts.d.ts +23 -11
  105. package/dist/types/tts.js +7 -0
  106. package/dist/types/voice.d.ts +272 -0
  107. package/dist/types/voice.js +136 -0
  108. package/dist/utils/audioFormatDetector.d.ts +15 -0
  109. package/dist/utils/audioFormatDetector.js +33 -0
  110. package/dist/utils/errorHandling.js +4 -0
  111. package/dist/utils/sttProcessor.d.ts +115 -0
  112. package/dist/utils/sttProcessor.js +294 -0
  113. package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
  114. package/dist/voice/RealtimeVoiceAPI.js +438 -0
  115. package/dist/voice/audio-utils.d.ts +135 -0
  116. package/dist/voice/audio-utils.js +434 -0
  117. package/dist/voice/errors.d.ts +123 -0
  118. package/dist/voice/errors.js +385 -0
  119. package/dist/voice/index.d.ts +26 -0
  120. package/dist/voice/index.js +54 -0
  121. package/dist/voice/providers/AzureSTT.d.ts +47 -0
  122. package/dist/voice/providers/AzureSTT.js +344 -0
  123. package/dist/voice/providers/AzureTTS.d.ts +59 -0
  124. package/dist/voice/providers/AzureTTS.js +348 -0
  125. package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
  126. package/dist/voice/providers/DeepgramSTT.js +549 -0
  127. package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
  128. package/dist/voice/providers/ElevenLabsTTS.js +310 -0
  129. package/dist/voice/providers/GeminiLive.d.ts +52 -0
  130. package/dist/voice/providers/GeminiLive.js +371 -0
  131. package/dist/voice/providers/GoogleSTT.d.ts +60 -0
  132. package/dist/voice/providers/GoogleSTT.js +453 -0
  133. package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
  134. package/dist/voice/providers/OpenAIRealtime.js +411 -0
  135. package/dist/voice/providers/OpenAISTT.d.ts +41 -0
  136. package/dist/voice/providers/OpenAISTT.js +285 -0
  137. package/dist/voice/providers/OpenAITTS.d.ts +49 -0
  138. package/dist/voice/providers/OpenAITTS.js +270 -0
  139. package/dist/voice/stream-handler.d.ts +166 -0
  140. package/dist/voice/stream-handler.js +513 -0
  141. package/package.json +5 -2
@@ -59,7 +59,6 @@ import { TaskManager } from "./tasks/taskManager.js";
  import { createTaskTools } from "./tasks/tools/taskTools.js";
  import { ATTR } from "./telemetry/attributes.js";
  import { tracers } from "./telemetry/tracers.js";
- // NEW: Generate function imports
  import { getConversationMessages, storeConversationTurn, } from "./utils/conversationMemory.js";
  // Enhanced error handling imports
  import { CircuitBreaker, ERROR_CODES, ErrorFactory, isAbortError, isRetriableError, logStructuredError, NeuroLinkError, withRetry, withTimeout, } from "./utils/errorHandling.js";
@@ -2933,7 +2932,15 @@ Current user's request: ${currentInput}`;
  ? optionsOrPrompt.length
  : options.input?.text?.length || 0);
  generateSpan.setAttribute("neurolink.has_tools", !!(options.tools && Object.keys(options.tools).length > 0));
- this.assertInputText(options.input?.text, "Input text is required and must be a non-empty string");
+ // When STT audio is provided, ensure options.input exists (the transcription
+ // will supply the text inside runStandardGenerateRequest) and skip text validation.
+ const hasSttAudio = !!(options.stt?.enabled && options.stt?.audio);
+ if (hasSttAudio && !options.input) {
+ options.input = { text: "" };
+ }
+ if (!hasSttAudio) {
+ this.assertInputText(options.input?.text, "Input text is required and must be a non-empty string");
+ }
  this.enforceSessionBudget(options.maxBudgetUsd);
  this.applyGenerateLifecycleMiddleware(options);
  await this.applyAuthenticatedRequestContext(options);
@@ -2941,11 +2948,27 @@ Current user's request: ${currentInput}`;
  }
  async maybeHandleEarlyGenerateResult(options, generateSpan) {
  if (options.workflow || options.workflowConfig) {
+ if (options.stt?.enabled && options.stt?.audio) {
+ // prepareGenerateRequest synthesizes input.text = "" for audio-only
+ // calls, so without this guard generateWithWorkflow runs with an
+ // empty prompt. Fail fast when there's no text fallback.
+ if (!options.input?.text?.trim()) {
+ throw new Error("STT audio is not supported with workflow mode without input.text");
+ }
+ logger.warn("[NeuroLink] STT audio preprocessing is not supported with workflow mode; audio will be ignored");
+ }
  return this.generateWithWorkflow(options);
  }
  if (options.output?.mode !== "ppt") {
  return null;
  }
+ if (options.stt?.enabled && options.stt?.audio) {
+ // Same fail-fast as the workflow branch — see comment above.
+ if (!options.input?.text?.trim()) {
+ throw new Error("STT audio is not supported with PPT mode without input.text");
+ }
+ logger.warn("[NeuroLink] STT audio preprocessing is not supported with PPT mode; audio will be ignored");
+ }
  const pptResult = await this.generateWithPPT(options);
  generateSpan.setAttribute("neurolink.output_length", pptResult.content?.length ?? 0);
  if (pptResult.analytics) {
@@ -2976,16 +2999,72 @@ Current user's request: ${currentInput}`;
  }
  await this.prepareGenerateAugmentations(options);
  const textOptions = await this.buildGenerateTextOptions(options, originalPrompt, factoryResult);
+ // STT preprocessing: transcribe audio input before LLM generation
+ let sttTranscription;
+ if (options.stt?.enabled && options.stt.audio) {
+ try {
+ // Always call — registerAllProviders() is idempotent via internal
+ // `registered` + `registrationPromise` deduplication. The previous
+ // isRegistered() guard short-circuited even when STT handler
+ // registration failed silently after AI providers were registered.
+ await ProviderRegistry.registerAllProviders();
+ const { STTProcessor } = await import("./utils/sttProcessor.js");
+ const sttProvider = options.stt.provider ?? "whisper";
+ sttTranscription = await STTProcessor.transcribe(options.stt.audio, sttProvider, options.stt);
+ // Inject transcription into the LLM prompt
+ if (sttTranscription.text) {
+ const existingText = textOptions.prompt || textOptions.input?.text || "";
+ if (!existingText) {
+ // No user text — use transcription directly as the prompt
+ textOptions.prompt = sttTranscription.text;
+ if (textOptions.input) {
+ textOptions.input.text = sttTranscription.text;
+ }
+ }
+ else {
+ // User provided text — prepend transcription as context
+ const combined = `[Transcribed audio]: ${sttTranscription.text}\n\n${existingText}`;
+ if (textOptions.prompt) {
+ textOptions.prompt = combined;
+ }
+ if (textOptions.input?.text) {
+ textOptions.input.text = combined;
+ }
+ }
+ }
+ }
+ catch (sttError) {
+ const existingText = textOptions.prompt || textOptions.input?.text || "";
+ if (!existingText) {
+ // Audio-only request — no text to fall back to, fail fast
+ throw sttError;
+ }
+ logger.warn(`[NeuroLink] STT transcription failed, falling back to text: ${sttError instanceof Error ? sttError.message : String(sttError)}`);
+ }
+ }
  const textResult = await this.generateTextInternal(textOptions);
- return this.finalizeGenerateRequestResult({
+ // For STT-only calls, originalPrompt was captured before transcription.
+ // Use the transcribed text as the effective prompt for telemetry, memory,
+ // and trace attribution so traces don't show empty prompts.
+ const effectiveOriginalPrompt = sttTranscription?.text
+ ? originalPrompt
+ ? `[Transcribed audio]: ${sttTranscription.text}\n\n${originalPrompt}`
+ : sttTranscription.text
+ : originalPrompt;
+ // Attach STT transcription to result
+ const generateResult = this.finalizeGenerateRequestResult({
  generateSpan,
  options,
  textOptions,
  textResult,
  factoryResult,
- originalPrompt,
+ originalPrompt: effectiveOriginalPrompt,
  startTime,
  });
+ if (sttTranscription) {
+ generateResult.transcription = sttTranscription;
+ }
+ return generateResult;
  }
  async maybeApplyGenerateOrchestration(options) {
  if (!this.enableOrchestration || options.provider || options.model) {
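Taken together, the generate() hunks above add a pre-recorded-audio path: the caller passes a buffer through the new stt options block, the transcription becomes (or prefixes) the prompt, and the transcript is attached to the result. A minimal usage sketch, not part of the diff; the option shape and the transcription/content result fields follow the code above, while the constructor call, import style, and file path are assumptions:

import { readFile } from "node:fs/promises";
import { NeuroLink } from "@juspay/neurolink";

const neurolink = new NeuroLink(); // assumed constructor, as in the package README
const audio = await readFile("./question.mp3"); // hypothetical local recording

// Audio-only call: input.text may be omitted; prepareGenerateRequest fills in
// input.text = "" and the transcription becomes the effective prompt.
const result = await neurolink.generate({
  stt: { enabled: true, audio }, // stt.provider defaults to "whisper" per the code above
});

console.log(result.transcription?.text); // what the STT provider heard
console.log(result.content); // the LLM's reply to the transcribed prompt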
@@ -3080,6 +3159,7 @@ Current user's request: ${currentInput}`;
  input: options.input,
  region: options.region,
  tts: options.tts,
+ stt: options.stt,
  fileRegistry: this.fileRegistry,
  timeout: options.timeout,
  abortSignal: options.abortSignal,
@@ -3124,7 +3204,13 @@ Current user's request: ${currentInput}`;
  toolsUsed: textResult.toolsUsed,
  timestamp: Date.now(),
  result: textResult,
- prompt: options.input?.text || options.prompt,
+ // Use the effective prompt (which already incorporates STT-transcribed
+ // text for audio-only calls) so observers see the real prompt instead
+ // of an empty string. Falls back through the same chain as before for
+ // text-only calls.
+ prompt: originalPrompt ||
+ options.input?.text ||
+ options.prompt,
  temperature: textOptions.temperature,
  maxTokens: textOptions.maxTokens,
  // A2 fix: Signal that Pipeline A (AI SDK → @langfuse/otel) already
@@ -3167,6 +3253,7 @@ Current user's request: ${currentInput}`;
  }
  : undefined,
  audio: textResult.audio,
+ transcription: textResult.transcription,
  video: textResult.video,
  ppt: textResult.ppt,
  ...(textResult.retries && { retries: textResult.retries }),
@@ -5088,7 +5175,38 @@ Current user's request: ${currentInput}`;
  const startTime = Date.now();
  const hrTimeStart = process.hrtime.bigint();
  const streamId = `neurolink-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
- const originalPrompt = options.input.text;
+ // STT preprocessing for stream(): transcribe audio buffer (not realtime frames)
+ // and inject into the prompt before validation/execution. Mirrors generate().
+ const sttOptions = options.stt;
+ const sttAudio = sttOptions?.audio;
+ const hasStreamSttAudio = !!(sttOptions?.enabled && sttAudio);
+ let streamSttTranscription;
+ if (hasStreamSttAudio && sttOptions && sttAudio) {
+ if (!options.input) {
+ options.input = { text: "" };
+ }
+ try {
+ // registerAllProviders() is idempotent; always call.
+ await ProviderRegistry.registerAllProviders();
+ const { STTProcessor } = await import("./utils/sttProcessor.js");
+ const sttProvider = sttOptions.provider ?? "whisper";
+ streamSttTranscription = await STTProcessor.transcribe(sttAudio, sttProvider, sttOptions);
+ if (streamSttTranscription.text) {
+ const existingText = options.input.text || "";
+ options.input.text = existingText
+ ? `[Transcribed audio]: ${streamSttTranscription.text}\n\n${existingText}`
+ : streamSttTranscription.text;
+ }
+ }
+ catch (sttError) {
+ const existingText = options.input.text || "";
+ if (!existingText) {
+ throw sttError;
+ }
+ logger.warn(`[NeuroLink] Stream STT transcription failed, falling back to text: ${sttError instanceof Error ? sttError.message : String(sttError)}`);
+ }
+ }
+ const originalPrompt = options.input?.text ?? "";
  options.fileRegistry = this.fileRegistry;
  await this.validateStreamRequestOptions(options, startTime);
  const workflowResult = await this.maybeHandleWorkflowStreamRequest({
@@ -5100,7 +5218,21 @@ Current user's request: ${currentInput}`;
  if (workflowResult) {
  return workflowResult;
  }
- return this.setLangfuseContextFromOptions(options, () => this.runStandardStreamRequest({
+ // TTS Mode 2 deferred: stream() emits text first, then synthesizes the
+ // accumulated response into a single audio chunk at end-of-stream and
+ // resolves `streamResult.audio` with the same TTSResult. The resolver is
+ // plumbed explicitly through the params bag (M11: previously a
+ // `_streamTtsResolve` cast on the caller's options object — fragile if
+ // the same options object was reused across concurrent stream() calls).
+ const ttsOptions = options.tts;
+ const wantsStreamTtsMode2 = !!(ttsOptions?.enabled && ttsOptions?.useAiResponse);
+ let resolveStreamTtsAudio;
+ const streamTtsAudioPromise = wantsStreamTtsMode2
+ ? new Promise((resolve) => {
+ resolveStreamTtsAudio = resolve;
+ })
+ : undefined;
+ const streamResult = await this.setLangfuseContextFromOptions(options, () => this.runStandardStreamRequest({
  options,
  streamSpan,
  spanStartTime,
@@ -5108,7 +5240,15 @@ Current user's request: ${currentInput}`;
  hrTimeStart,
  streamId,
  originalPrompt,
+ ttsResolver: resolveStreamTtsAudio,
  }));
+ if (streamSttTranscription) {
+ streamResult.transcription = streamSttTranscription;
+ }
+ if (streamTtsAudioPromise) {
+ streamResult.audio = streamTtsAudioPromise;
+ }
+ return streamResult;
  }
  catch (error) {
  streamSpan.setStatus({
@@ -5159,7 +5299,7 @@ Current user's request: ${currentInput}`;
  return result;
  }
  async runStandardStreamRequest(params) {
- const { options, streamSpan, spanStartTime, startTime, hrTimeStart, streamId, originalPrompt, } = params;
+ const { options, streamSpan, spanStartTime, startTime, hrTimeStart, streamId, originalPrompt, ttsResolver, } = params;
  logger.debug("[NeuroLink] Running standard stream request", {
  streamId,
  provider: options.provider,
@@ -5244,6 +5384,7 @@ Current user's request: ${currentInput}`;
  typeof chunk === "object" &&
  "type" in chunk &&
  (chunk.type === "audio" ||
+ chunk.type === "tts_audio" ||
  chunk.type === "image");
  if (!isNoOutputSentinel && (hasTextContent || hasMediaPayload)) {
  realOutputChunks++;
@@ -5278,6 +5419,22 @@ Current user's request: ${currentInput}`;
  accumulatedContent += content;
  });
  }
+ // TTS Mode 2 for stream(): synthesize the accumulated response
+ // and yield ONE final audio chunk so callers iterating the stream
+ // get the audio inline; also resolve `streamResult.audio` so the
+ // ergonomic `await result.audio` pattern works post-iteration.
+ // m5: synthesis logic lives in a dedicated helper to keep this
+ // generator under the max-lines-per-function lint budget.
+ const ttsModeResult = await self.synthesizeStreamModeTwo({
+ ttsOptions: enhancedOptions.tts,
+ providerName,
+ fallbackProvider: enhancedOptions.provider,
+ accumulatedContent,
+ ttsResolver,
+ });
+ if (ttsModeResult.audioChunk) {
+ yield ttsModeResult.audioChunk;
+ }
  resolvedUsage = streamUsage;
  if (!resolvedUsage && streamAnalytics) {
  try {
@@ -5343,6 +5500,14 @@ Current user's request: ${currentInput}`;
  throw error;
  }
  finally {
+ // Belt-and-braces: if TTS Mode 2 was requested but synthesis never
+ // ran (stream errored before reaching the TTS block, or Mode 2 path
+ // was skipped), resolve the audio promise to undefined so callers
+ // awaiting `streamResult.audio` never hang. Uses the explicit
+ // `ttsResolver` param (M11), not a side-channel cast.
+ // m4: a duplicate resolution is a silent no-op — Promise resolvers
+ // never throw, so no try/catch needed here.
+ ttsResolver?.(undefined);
  logger.debug("[NeuroLink.stream] Stream finished, performing cleanup", {
  provider: providerName,
  model: enhancedOptions.model,
@@ -5489,6 +5654,67 @@ Current user's request: ${currentInput}`;
  return this.handleStreamError(error, options, startTime, streamId, undefined, undefined);
  }
  }
+ /**
+ * TTS Mode 2 synthesis helper for the stream() pipeline.
+ *
+ * m5 — extracted from runStandardStreamRequest so the surrounding generator
+ * stays under the max-lines-per-function lint budget. Behaviour preserved
+ * exactly:
+ * - When Mode 2 is enabled (`tts.enabled && tts.useAiResponse`) AND the
+ * model produced non-empty content: synthesises one final audio buffer
+ * and returns it as an `audioChunk` for the caller to `yield`. Resolves
+ * `ttsResolver` with the `TTSResult`.
+ * - When Mode 2 is enabled but synthesis fails: logs a warning and resolves
+ * `ttsResolver` with `undefined`.
+ * - When Mode 2 is requested but skipped (empty content / wrong mode):
+ * resolves `ttsResolver` with `undefined` early so callers awaiting
+ * `result.audio` unblock before the surrounding `finally` cleanup
+ * completes (Issue 7 latency micro-opt — the finally block also resolves
+ * defensively, so this is a redundant early signal, not a coverage fix).
+ */
+ async synthesizeStreamModeTwo(params) {
+ const { ttsOptions, providerName, fallbackProvider, accumulatedContent, ttsResolver, } = params;
+ if (!ttsOptions?.enabled ||
+ !ttsOptions.useAiResponse ||
+ accumulatedContent.trim().length === 0) {
+ ttsResolver?.(undefined);
+ return {};
+ }
+ try {
+ const { TTSProcessor } = await import("./utils/ttsProcessor.js");
+ // ttsOptions.provider takes precedence; otherwise fall back to the
+ // chat provider ID ONLY when it happens to be a registered TTS handler
+ // (e.g. "google-ai" works for both LLM and TTS). For LLM-only IDs like
+ // "anthropic", we'd otherwise complete generation and then fail synth —
+ // surface that mismatch up front instead.
+ const candidate = ttsOptions.provider ?? fallbackProvider ?? providerName;
+ const ttsProvider = candidate && TTSProcessor.supports(candidate) ? candidate : undefined;
+ if (!ttsProvider) {
+ throw new Error(`No TTS provider resolved for stream Mode 2 (set tts.provider explicitly — chat provider "${candidate ?? "<unset>"}" is not a registered TTS handler)`);
+ }
+ const ttsResult = await TTSProcessor.synthesize(accumulatedContent, ttsProvider, ttsOptions);
+ ttsResolver?.(ttsResult);
+ return {
+ audioChunk: {
+ type: "tts_audio",
+ audio: {
+ data: ttsResult.buffer,
+ format: ttsResult.format,
+ index: 0,
+ isFinal: true,
+ cumulativeSize: ttsResult.size,
+ voice: ttsResult.voice,
+ sampleRate: ttsResult.sampleRate,
+ },
+ },
+ };
+ }
+ catch (ttsError) {
+ logger.warn(`[NeuroLink.stream] Stream TTS Mode 2 synthesis failed: ${ttsError instanceof Error ? ttsError.message : String(ttsError)}`);
+ ttsResolver?.(undefined);
+ return {};
+ }
+ }
  /**
  * Prepare stream options: initialize memory, MCP, retrieval, orchestration,
  * Ollama tool auto-disable, factory processing, and tool detection.
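The stream() pipeline gets the same treatment end to end: stt.audio is transcribed before validation, and TTS Mode 2 (tts.enabled plus tts.useAiResponse) yields a single final tts_audio chunk and resolves streamResult.audio with the same TTSResult. A consumption sketch, not part of the diff; the chunk and result fields follow the hunks above, while the async-iterable property name (stream here), constructor, and file paths are assumptions:

import { readFile, writeFile } from "node:fs/promises";
import { NeuroLink } from "@juspay/neurolink";

const neurolink = new NeuroLink();
const question = await readFile("./question.wav"); // hypothetical recording

const result = await neurolink.stream({
  stt: { enabled: true, audio: question },     // pre-recorded buffer, not realtime input.audio frames
  tts: { enabled: true, useAiResponse: true }, // Mode 2: synthesize the accumulated answer
});

for await (const chunk of result.stream) {     // iterable property name assumed
  if (chunk.type === "tts_audio" && chunk.audio?.isFinal) {
    await writeFile(`./answer.${chunk.audio.format}`, chunk.audio.data);
  }
}

console.log(result.transcription?.text); // transcript attached by the stream() hunk above
const tts = await result.audio;          // same TTSResult, or undefined if synthesis was skipped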
@@ -5519,8 +5745,15 @@ Current user's request: ${currentInput}`;
  orchestratedModel: orchestratedOptions.model,
  prompt: options.input.text?.substring(0, 100),
  });
- // Use orchestrated options
- Object.assign(options, orchestratedOptions);
+ // Use orchestrated options — rebind the local `options` to a fresh
+ // merged object instead of mutating the caller-supplied one
+ // (NEW2: avoids cross-call contamination when callers reuse options).
+ // Issue 6: extract to an explicit local so the rebind intent is
+ // obvious to future readers, and the lint suppression is scoped
+ // narrowly to the one statement that actually rebinds the param.
+ const mergedOptions = { ...options, ...orchestratedOptions };
+ // eslint-disable-next-line no-param-reassign -- see NEW2/Issue 6 above
+ options = mergedOptions;
  // Re-resolve model alias in case orchestration returned an alias
  if (orchestratedOptions.model) {
  options.model = resolveModel(options.model, this.modelAliasConfig);
@@ -5787,6 +6020,7 @@ Current user's request: ${currentInput}`;
  typeof fallbackChunk === "object" &&
  "type" in fallbackChunk &&
  (fallbackChunk.type === "audio" ||
+ fallbackChunk.type === "tts_audio" ||
  fallbackChunk.type === "image");
  if (!isFallbackNoOutputSentinel &&
  (fallbackHasTextContent || fallbackHasMediaPayload)) {
@@ -5909,8 +6143,10 @@ Current user's request: ${currentInput}`;
  const hasAudio = !!(options?.input?.audio &&
  options.input.audio.frames &&
  typeof options.input.audio.frames[Symbol.asyncIterator] === "function");
- if (!hasText && !hasAudio) {
- throw new Error("Stream options must include either input.text or input.audio");
+ // STT pre-recorded audio buffer counts as input — transcription will fill text.
+ const hasSttAudio = !!(options?.stt?.enabled && options?.stt?.audio);
+ if (!hasText && !hasAudio && !hasSttAudio) {
+ throw new Error("Stream options must include either input.text, input.audio, or stt.audio");
  }
  }
  /**
@@ -7199,6 +7435,7 @@ Current user's request: ${currentInput}`;
  inputSize: inputStr.length,
  truncatedInput: inputStr.length > 2048 ? inputStr.substring(0, 2048) : inputStr,
  options,
+ hitlState: { triggered: false },
  };
  }
  async executeToolWithSpan(toolName, params, options, executionContext, toolSpan) {
@@ -7305,7 +7542,7 @@ Current user's request: ${currentInput}`;
  circuitBreakerState: prepared.circuitBreaker.getState(),
  });
  const result = await prepared.circuitBreaker.execute(async () => {
- return withRetry(async () => withTimeout(this.executeToolInternal(toolName, params, prepared.finalOptions), prepared.finalOptions.timeout, ErrorFactory.toolTimeout(toolName, prepared.finalOptions.timeout)), {
+ return withRetry(async () => withTimeout(this.executeToolInternal(toolName, params, prepared.finalOptions, executionContext.hitlState), prepared.finalOptions.timeout, ErrorFactory.toolTimeout(toolName, prepared.finalOptions.timeout)), {
  maxAttempts: prepared.finalOptions.maxRetries + 1,
  delayMs: prepared.finalOptions.retryDelayMs,
  isRetriable: isRetriableError,
@@ -7526,7 +7763,7 @@ Current user's request: ${currentInput}`;
  * - Annotations: skip cache for destructive tools, retry safe tools on failure
  * - Middleware: apply global middleware chain before execution
  */
- async executeToolInternal(toolName, params, options) {
+ async executeToolInternal(toolName, params, options, HITLState) {
  const functionTag = "NeuroLink.executeToolInternal";
  // === MCP ENHANCEMENT: Infer annotations for cache/retry decisions ===
  const toolAnnotations = this.getToolAnnotationsForExecution(toolName);
@@ -7645,6 +7882,7 @@ Current user's request: ${currentInput}`;
  const context = {
  ...storedContext,
  ...passedAuthContext,
+ hitlState: HITLState,
  };
  logger.debug(`[Using merged context for unified registry tool:`, {
  toolName,
@@ -233,6 +233,7 @@ export class LaminarExporter extends BaseExporter {
  [SpanType.PPT_GENERATION]: "custom",
  [SpanType.WORKFLOW]: "workflow",
  [SpanType.TTS]: "custom",
+ [SpanType.STT]: "custom",
  [SpanType.SERVER_REQUEST]: "custom",
  [SpanType.CUSTOM]: "custom",
  };
@@ -216,6 +216,7 @@ export class PostHogExporter extends BaseExporter {
  [SpanType.PPT_GENERATION]: "ai_ppt_generation",
  [SpanType.WORKFLOW]: "ai_workflow",
  [SpanType.TTS]: "ai_tts_synthesis",
+ [SpanType.STT]: "ai_stt_transcription",
  [SpanType.SERVER_REQUEST]: "ai_server_request",
  [SpanType.CUSTOM]: "ai_custom_span",
  };
@@ -234,6 +234,7 @@ export class SpanSerializer {
  [SpanType.PPT_GENERATION]: "chain",
  [SpanType.WORKFLOW]: "chain",
  [SpanType.TTS]: "chain",
+ [SpanType.STT]: "chain",
  [SpanType.SERVER_REQUEST]: "chain",
  [SpanType.CUSTOM]: "chain",
  };
@@ -0,0 +1,14 @@
+ /**
+ * Constant-time bearer-token comparison.
+ *
+ * Bug 2 mitigation: a normal `===` compare on bearer tokens leaks the token
+ * length and the position of the first mismatching byte through timing
+ * differences, which is reachable when the voice server is bound publicly
+ * (`VOICE_SERVER_ALLOW_PUBLIC=1`).
+ *
+ * Returns `false` for any comparison whose lengths differ — this avoids the
+ * `RangeError` that `crypto.timingSafeEqual` throws on mismatched buffers
+ * while still preserving the constant-time property for equal-length inputs
+ * (which is the only case an attacker can probe).
+ */
+ export declare function timingSafeEqualString(provided: string, expected: string): boolean;
@@ -0,0 +1,23 @@
+ import crypto from "crypto";
+ /**
+ * Constant-time bearer-token comparison.
+ *
+ * Bug 2 mitigation: a normal `===` compare on bearer tokens leaks the token
+ * length and the position of the first mismatching byte through timing
+ * differences, which is reachable when the voice server is bound publicly
+ * (`VOICE_SERVER_ALLOW_PUBLIC=1`).
+ *
+ * Returns `false` for any comparison whose lengths differ — this avoids the
+ * `RangeError` that `crypto.timingSafeEqual` throws on mismatched buffers
+ * while still preserving the constant-time property for equal-length inputs
+ * (which is the only case an attacker can probe).
+ */
+ export function timingSafeEqualString(provided, expected) {
+ const providedBuf = Buffer.from(provided, "utf8");
+ const expectedBuf = Buffer.from(expected, "utf8");
+ if (providedBuf.length !== expectedBuf.length) {
+ return false;
+ }
+ return crypto.timingSafeEqual(providedBuf, expectedBuf);
+ }
+ //# sourceMappingURL=tokenCompare.js.map
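In behaviour the helper is a drop-in replacement for a strict-equality check, which is how the auth middleware below uses it. A quick illustrative sketch with made-up token values:

import { timingSafeEqualString } from "./tokenCompare.js";

timingSafeEqualString("correct-token", "correct-token"); // true
timingSafeEqualString("correct-tokeN", "correct-token"); // false, compared in constant time
timingSafeEqualString("short", "correct-token");         // false up front, lengths differ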
@@ -4,6 +4,7 @@ import http from "http";
  import path from "path";
  import { fileURLToPath } from "url";
  import { setupWebSocket } from "./voiceWebSocketHandler.js";
+ import { timingSafeEqualString } from "./tokenCompare.js";
  import { NeuroLink } from "../../neurolink.js";
  import { logger } from "../../utils/logger.js";
  import { withTimeout } from "../../utils/async/withTimeout.js";
@@ -30,6 +31,50 @@ function resolvePublicPath() {
  }
  export async function startVoiceServer(port = 3000) {
  const app = express();
+ // NEW11: refuse to bind to non-loopback interfaces unless the operator
+ // has explicitly opted in. The voice server has minimal hardening and
+ // exposing it publicly without a token leaks Soniox / Cartesia / LLM
+ // credit usage to anyone who can reach the listener.
+ const allowPublic = process.env.VOICE_SERVER_ALLOW_PUBLIC === "1";
+ const host = allowPublic
+ ? (process.env.VOICE_SERVER_HOST ?? "0.0.0.0")
+ : "127.0.0.1";
+ // NEW11: optional shared-secret bearer token for both HTTP and WebSocket
+ // upgrade. When VOICE_SERVER_AUTH_TOKEN is set, every HTTP request must
+ // carry `Authorization: Bearer <token>`. The WS upgrade additionally
+ // accepts `?token=<token>` because browser WebSocket constructors cannot
+ // set custom headers — see voiceWebSocketHandler.verifyClient. HTTP routes
+ // intentionally reject `?token=` (would leak via Referer + access logs).
+ const authToken = process.env.VOICE_SERVER_AUTH_TOKEN;
+ /* ---------- BODY LIMITS + AUTH ---------- */
+ // NEW11: cap JSON / urlencoded body to 100kb. Express's default is 100kb
+ // for json() but only when explicitly registered; without this any future
+ // body parser would default to whatever its own limit is.
+ app.use(express.json({ limit: "100kb" }));
+ app.use(express.urlencoded({ limit: "100kb", extended: false }));
+ // NEW11: minimal HTTP auth middleware. Skips when no token is configured
+ // (back-compat — local-only dev keeps working). Skips for /health so
+ // load-balancers can probe without credentials.
+ if (authToken) {
+ app.use((req, res, next) => {
+ if (req.path === "/health") {
+ return next();
+ }
+ const header = req.header("authorization");
+ // Bug 3 fix: HTTP routes only accept the bearer header. The `?token=`
+ // fallback exists only on the WS upgrade where the browser API cannot
+ // attach headers — using it on regular HTTP would leak credentials via
+ // Referer headers, browser history, server access logs, and proxies.
+ const provided = header?.startsWith("Bearer ")
+ ? header.slice(7)
+ : undefined;
+ if (!provided || !timingSafeEqualString(provided, authToken)) {
+ res.status(401).json({ error: "Unauthorized" });
+ return;
+ }
+ next();
+ });
+ }
  /* ---------- STATIC FILES ---------- */
  const publicPath = resolvePublicPath();
  logger.info("[SERVER] Serving static from:", publicPath);
@@ -41,15 +86,29 @@ export async function startVoiceServer(port = 3000) {
  app.get("/health", (_, res) => {
  res.json({ status: "ok" });
  });
+ /* ---------- ERROR HANDLER ---------- */
+ // NEW11: global Express error handler so synchronous and async errors are
+ // caught instead of crashing the process or leaking stack traces.
+ app.use((err, _req, res, _next) => {
+ logger.error(`[SERVER] Unhandled error: ${err instanceof Error ? err.message : String(err)}`);
+ if (!res.headersSent) {
+ res.status(500).json({ error: "Internal server error" });
+ }
+ });
  const server = http.createServer(app);
  /* ---------- WS ---------- */
- setupWebSocket(server);
+ // NEW11: pass the auth token + allow-public flag through to the WS handler
+ // so it can verify clients on upgrade and apply maxPayload caps.
+ setupWebSocket(server, { authToken, maxPayload: 1_048_576 });
  /* ---------- START ---------- */
  await new Promise((resolve, reject) => {
  server.once("error", reject);
- server.listen(port, () => {
+ server.listen(port, host, () => {
  server.removeListener("error", reject);
- logger.info(`[SERVER] Voice server running at http://localhost:${port}`);
+ const exposure = allowPublic
+ ? `bound publicly on ${host}:${port} (VOICE_SERVER_ALLOW_PUBLIC=1)`
+ : `bound to loopback ${host}:${port} (set VOICE_SERVER_ALLOW_PUBLIC=1 to expose externally)`;
+ logger.info(`[SERVER] Voice server running — ${exposure}${authToken ? " (auth required)" : " (no auth — token via VOICE_SERVER_AUTH_TOKEN recommended)"}`);
  resolve();
  });
  });
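Operationally the hardened server keeps its loopback, no-auth defaults for local development and gates everything else behind environment variables. A launch sketch, with the env variable names taken from the code above and the import path and token value assumed:

// Configure before starting: auth token plus explicit opt-in to public binding.
process.env.VOICE_SERVER_AUTH_TOKEN = "a-long-random-shared-secret"; // hypothetical value
process.env.VOICE_SERVER_ALLOW_PUBLIC = "1"; // bind beyond 127.0.0.1
process.env.VOICE_SERVER_HOST = "0.0.0.0";   // optional; this is the public default

// Subpath import is an assumption; adjust to however the package exposes it.
const { startVoiceServer } = await import("@juspay/neurolink/dist/server/voice/voiceServerApp.js");
await startVoiceServer(3000);

// HTTP callers must send the bearer header (/health stays open for probes):
//   curl -H "Authorization: Bearer a-long-random-shared-secret" http://host:3000/
// Browser WebSocket clients pass the token as a query parameter instead:
//   new WebSocket("ws://host:3000/?token=a-long-random-shared-secret");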
@@ -1,7 +1,24 @@
  import type { Server as HttpServer } from "http";
  /**
- * Call from the voice-server command handler BEFORE importing anything else
- * so the env change is scoped to voice mode only.
+ * Voice-server-mode environment configuration.
+ *
+ * @deprecated NEW12 — this used to mutate `process.env.NEUROLINK_DISABLE_MCP_TOOLS`
+ * which is process-wide. That broke any embedder that called this function in
+ * a process which ALSO used NeuroLink for non-voice work. The disable-tools
+ * intent is now passed explicitly via `disableTools: true` on every NeuroLink
+ * `generate()` / `stream()` call inside this server (see line ~167). Calling
+ * this function is now a no-op kept for backwards compatibility.
  */
  export declare function configureVoiceServerEnvironment(): void;
- export declare function setupWebSocket(server: HttpServer): void;
+ /**
+ * Returns a copy of an outbound Soniox payload with the API key redacted.
+ *
+ * Use this whenever debug logging the auth frame — never JSON.stringify the
+ * raw object. (C3 mitigation: prevents the Soniox API key from leaking into
+ * any aggregated log sink even if a future debug statement serialises the
+ * outbound payload.)
+ */
+ export declare function redactSonioxAuth<T extends {
+ api_key?: string;
+ }>(payload: T): T;
+ export declare function setupWebSocket(server: HttpServer, options?: import("../../types/index.js").ServerVoiceWebSocketOptions): void;
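The widened setupWebSocket signature lets an embedder attach the voice WebSocket handler to its own HTTP server and pass the upgrade-time auth and payload limits explicitly, mirroring what voiceServerApp.js does above. A hedged embedding sketch; the options fields (authToken, maxPayload) come from the voiceServerApp.js call site, the rest is assumed:

import http from "http";
import { setupWebSocket } from "./voiceWebSocketHandler.js";

const server = http.createServer();
setupWebSocket(server, {
  authToken: process.env.VOICE_SERVER_AUTH_TOKEN, // checked on WS upgrade (header or ?token=)
  maxPayload: 1_048_576,                          // 1 MiB frame cap, same value voiceServerApp.js passes
});
server.listen(3000, "127.0.0.1");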