@juspay/neurolink 9.61.1 → 9.62.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +23 -17
- package/dist/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/browser/neurolink.min.js +382 -364
- package/dist/cli/commands/serve.js +9 -0
- package/dist/cli/commands/voiceServer.d.ts +7 -0
- package/dist/cli/commands/voiceServer.js +9 -1
- package/dist/cli/factories/commandFactory.js +136 -11
- package/dist/cli/loop/optionsSchema.d.ts +1 -1
- package/dist/cli/utils/audioFileUtils.d.ts +3 -3
- package/dist/cli/utils/audioFileUtils.js +5 -1
- package/dist/core/baseProvider.js +29 -6
- package/dist/factories/providerRegistry.d.ts +14 -0
- package/dist/factories/providerRegistry.js +141 -2
- package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
- package/dist/lib/core/baseProvider.js +29 -6
- package/dist/lib/factories/providerRegistry.d.ts +14 -0
- package/dist/lib/factories/providerRegistry.js +141 -2
- package/dist/lib/mcp/toolRegistry.js +7 -1
- package/dist/lib/neurolink.d.ts +19 -0
- package/dist/lib/neurolink.js +252 -14
- package/dist/lib/observability/exporters/laminarExporter.js +1 -0
- package/dist/lib/observability/exporters/posthogExporter.js +1 -0
- package/dist/lib/observability/utils/spanSerializer.js +1 -0
- package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
- package/dist/lib/server/voice/tokenCompare.js +23 -0
- package/dist/lib/server/voice/voiceServerApp.js +62 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/lib/types/generate.d.ts +47 -0
- package/dist/lib/types/hitl.d.ts +3 -0
- package/dist/lib/types/index.d.ts +1 -1
- package/dist/lib/types/index.js +1 -1
- package/dist/lib/types/realtime.d.ts +243 -0
- package/dist/lib/types/realtime.js +70 -0
- package/dist/lib/types/server.d.ts +68 -0
- package/dist/lib/types/span.d.ts +2 -0
- package/dist/lib/types/span.js +2 -0
- package/dist/lib/types/stream.d.ts +36 -14
- package/dist/lib/types/stt.d.ts +585 -0
- package/dist/lib/types/stt.js +90 -0
- package/dist/lib/types/tools.d.ts +2 -0
- package/dist/lib/types/tts.d.ts +23 -11
- package/dist/lib/types/tts.js +7 -0
- package/dist/lib/types/voice.d.ts +272 -0
- package/dist/lib/types/voice.js +137 -0
- package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
- package/dist/lib/utils/audioFormatDetector.js +34 -0
- package/dist/lib/utils/errorHandling.js +4 -0
- package/dist/lib/utils/sttProcessor.d.ts +115 -0
- package/dist/lib/utils/sttProcessor.js +295 -0
- package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
- package/dist/lib/voice/audio-utils.d.ts +135 -0
- package/dist/lib/voice/audio-utils.js +435 -0
- package/dist/lib/voice/errors.d.ts +123 -0
- package/dist/lib/voice/errors.js +386 -0
- package/dist/lib/voice/index.d.ts +26 -0
- package/dist/lib/voice/index.js +55 -0
- package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/lib/voice/providers/AzureSTT.js +345 -0
- package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/lib/voice/providers/AzureTTS.js +349 -0
- package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
- package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/lib/voice/providers/GeminiLive.js +372 -0
- package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/lib/voice/providers/GoogleSTT.js +454 -0
- package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
- package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/lib/voice/providers/OpenAISTT.js +286 -0
- package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/lib/voice/providers/OpenAITTS.js +271 -0
- package/dist/lib/voice/stream-handler.d.ts +166 -0
- package/dist/lib/voice/stream-handler.js +514 -0
- package/dist/mcp/toolRegistry.js +7 -1
- package/dist/neurolink.d.ts +19 -0
- package/dist/neurolink.js +252 -14
- package/dist/observability/exporters/laminarExporter.js +1 -0
- package/dist/observability/exporters/posthogExporter.js +1 -0
- package/dist/observability/utils/spanSerializer.js +1 -0
- package/dist/server/voice/tokenCompare.d.ts +14 -0
- package/dist/server/voice/tokenCompare.js +22 -0
- package/dist/server/voice/voiceServerApp.js +62 -3
- package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
- package/dist/server/voice/voiceWebSocketHandler.js +555 -435
- package/dist/types/generate.d.ts +47 -0
- package/dist/types/hitl.d.ts +3 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/realtime.d.ts +243 -0
- package/dist/types/realtime.js +69 -0
- package/dist/types/server.d.ts +68 -0
- package/dist/types/span.d.ts +2 -0
- package/dist/types/span.js +2 -0
- package/dist/types/stream.d.ts +36 -14
- package/dist/types/stt.d.ts +585 -0
- package/dist/types/stt.js +89 -0
- package/dist/types/tools.d.ts +2 -0
- package/dist/types/tts.d.ts +23 -11
- package/dist/types/tts.js +7 -0
- package/dist/types/voice.d.ts +272 -0
- package/dist/types/voice.js +136 -0
- package/dist/utils/audioFormatDetector.d.ts +15 -0
- package/dist/utils/audioFormatDetector.js +33 -0
- package/dist/utils/errorHandling.js +4 -0
- package/dist/utils/sttProcessor.d.ts +115 -0
- package/dist/utils/sttProcessor.js +294 -0
- package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
- package/dist/voice/RealtimeVoiceAPI.js +438 -0
- package/dist/voice/audio-utils.d.ts +135 -0
- package/dist/voice/audio-utils.js +434 -0
- package/dist/voice/errors.d.ts +123 -0
- package/dist/voice/errors.js +385 -0
- package/dist/voice/index.d.ts +26 -0
- package/dist/voice/index.js +54 -0
- package/dist/voice/providers/AzureSTT.d.ts +47 -0
- package/dist/voice/providers/AzureSTT.js +344 -0
- package/dist/voice/providers/AzureTTS.d.ts +59 -0
- package/dist/voice/providers/AzureTTS.js +348 -0
- package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
- package/dist/voice/providers/DeepgramSTT.js +549 -0
- package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
- package/dist/voice/providers/ElevenLabsTTS.js +310 -0
- package/dist/voice/providers/GeminiLive.d.ts +52 -0
- package/dist/voice/providers/GeminiLive.js +371 -0
- package/dist/voice/providers/GoogleSTT.d.ts +60 -0
- package/dist/voice/providers/GoogleSTT.js +453 -0
- package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
- package/dist/voice/providers/OpenAIRealtime.js +411 -0
- package/dist/voice/providers/OpenAISTT.d.ts +41 -0
- package/dist/voice/providers/OpenAISTT.js +285 -0
- package/dist/voice/providers/OpenAITTS.d.ts +49 -0
- package/dist/voice/providers/OpenAITTS.js +270 -0
- package/dist/voice/stream-handler.d.ts +166 -0
- package/dist/voice/stream-handler.js +513 -0
- package/package.json +5 -2
package/dist/neurolink.d.ts
CHANGED
@@ -764,6 +764,25 @@ export declare class NeuroLink {
     private validateStreamRequestOptions;
     private maybeHandleWorkflowStreamRequest;
     private runStandardStreamRequest;
+    /**
+     * TTS Mode 2 synthesis helper for the stream() pipeline.
+     *
+     * m5 — extracted from runStandardStreamRequest so the surrounding generator
+     * stays under the max-lines-per-function lint budget. Behaviour preserved
+     * exactly:
+     * - When Mode 2 is enabled (`tts.enabled && tts.useAiResponse`) AND the
+     *   model produced non-empty content: synthesises one final audio buffer
+     *   and returns it as an `audioChunk` for the caller to `yield`. Resolves
+     *   `ttsResolver` with the `TTSResult`.
+     * - When Mode 2 is enabled but synthesis fails: logs a warning and resolves
+     *   `ttsResolver` with `undefined`.
+     * - When Mode 2 is requested but skipped (empty content / wrong mode):
+     *   resolves `ttsResolver` with `undefined` early so callers awaiting
+     *   `result.audio` unblock before the surrounding `finally` cleanup
+     *   completes (Issue 7 latency micro-opt — the finally block also resolves
+     *   defensively, so this is a redundant early signal, not a coverage fix).
+     */
+    private synthesizeStreamModeTwo;
     /**
      * Prepare stream options: initialize memory, MCP, retrieval, orchestration,
      * Ollama tool auto-disable, factory processing, and tool detection.
package/dist/neurolink.js
CHANGED
@@ -59,7 +59,6 @@ import { TaskManager } from "./tasks/taskManager.js";
 import { createTaskTools } from "./tasks/tools/taskTools.js";
 import { ATTR } from "./telemetry/attributes.js";
 import { tracers } from "./telemetry/tracers.js";
-// NEW: Generate function imports
 import { getConversationMessages, storeConversationTurn, } from "./utils/conversationMemory.js";
 // Enhanced error handling imports
 import { CircuitBreaker, ERROR_CODES, ErrorFactory, isAbortError, isRetriableError, logStructuredError, NeuroLinkError, withRetry, withTimeout, } from "./utils/errorHandling.js";
@@ -2933,7 +2932,15 @@ Current user's request: ${currentInput}`;
            ? optionsOrPrompt.length
            : options.input?.text?.length || 0);
        generateSpan.setAttribute("neurolink.has_tools", !!(options.tools && Object.keys(options.tools).length > 0));
-
+        // When STT audio is provided, ensure options.input exists (the transcription
+        // will supply the text inside runStandardGenerateRequest) and skip text validation.
+        const hasSttAudio = !!(options.stt?.enabled && options.stt?.audio);
+        if (hasSttAudio && !options.input) {
+            options.input = { text: "" };
+        }
+        if (!hasSttAudio) {
+            this.assertInputText(options.input?.text, "Input text is required and must be a non-empty string");
+        }
        this.enforceSessionBudget(options.maxBudgetUsd);
        this.applyGenerateLifecycleMiddleware(options);
        await this.applyAuthenticatedRequestContext(options);
@@ -2941,11 +2948,27 @@ Current user's request: ${currentInput}`;
    }
    async maybeHandleEarlyGenerateResult(options, generateSpan) {
        if (options.workflow || options.workflowConfig) {
+            if (options.stt?.enabled && options.stt?.audio) {
+                // prepareGenerateRequest synthesizes input.text = "" for audio-only
+                // calls, so without this guard generateWithWorkflow runs with an
+                // empty prompt. Fail fast when there's no text fallback.
+                if (!options.input?.text?.trim()) {
+                    throw new Error("STT audio is not supported with workflow mode without input.text");
+                }
+                logger.warn("[NeuroLink] STT audio preprocessing is not supported with workflow mode; audio will be ignored");
+            }
            return this.generateWithWorkflow(options);
        }
        if (options.output?.mode !== "ppt") {
            return null;
        }
+        if (options.stt?.enabled && options.stt?.audio) {
+            // Same fail-fast as the workflow branch — see comment above.
+            if (!options.input?.text?.trim()) {
+                throw new Error("STT audio is not supported with PPT mode without input.text");
+            }
+            logger.warn("[NeuroLink] STT audio preprocessing is not supported with PPT mode; audio will be ignored");
+        }
        const pptResult = await this.generateWithPPT(options);
        generateSpan.setAttribute("neurolink.output_length", pptResult.content?.length ?? 0);
        if (pptResult.analytics) {
@@ -2976,16 +2999,72 @@ Current user's request: ${currentInput}`;
        }
        await this.prepareGenerateAugmentations(options);
        const textOptions = await this.buildGenerateTextOptions(options, originalPrompt, factoryResult);
+        // STT preprocessing: transcribe audio input before LLM generation
+        let sttTranscription;
+        if (options.stt?.enabled && options.stt.audio) {
+            try {
+                // Always call — registerAllProviders() is idempotent via internal
+                // `registered` + `registrationPromise` deduplication. The previous
+                // isRegistered() guard short-circuited even when STT handler
+                // registration failed silently after AI providers were registered.
+                await ProviderRegistry.registerAllProviders();
+                const { STTProcessor } = await import("./utils/sttProcessor.js");
+                const sttProvider = options.stt.provider ?? "whisper";
+                sttTranscription = await STTProcessor.transcribe(options.stt.audio, sttProvider, options.stt);
+                // Inject transcription into the LLM prompt
+                if (sttTranscription.text) {
+                    const existingText = textOptions.prompt || textOptions.input?.text || "";
+                    if (!existingText) {
+                        // No user text — use transcription directly as the prompt
+                        textOptions.prompt = sttTranscription.text;
+                        if (textOptions.input) {
+                            textOptions.input.text = sttTranscription.text;
+                        }
+                    }
+                    else {
+                        // User provided text — prepend transcription as context
+                        const combined = `[Transcribed audio]: ${sttTranscription.text}\n\n${existingText}`;
+                        if (textOptions.prompt) {
+                            textOptions.prompt = combined;
+                        }
+                        if (textOptions.input?.text) {
+                            textOptions.input.text = combined;
+                        }
+                    }
+                }
+            }
+            catch (sttError) {
+                const existingText = textOptions.prompt || textOptions.input?.text || "";
+                if (!existingText) {
+                    // Audio-only request — no text to fall back to, fail fast
+                    throw sttError;
+                }
+                logger.warn(`[NeuroLink] STT transcription failed, falling back to text: ${sttError instanceof Error ? sttError.message : String(sttError)}`);
+            }
+        }
        const textResult = await this.generateTextInternal(textOptions);
-
+        // For STT-only calls, originalPrompt was captured before transcription.
+        // Use the transcribed text as the effective prompt for telemetry, memory,
+        // and trace attribution so traces don't show empty prompts.
+        const effectiveOriginalPrompt = sttTranscription?.text
+            ? originalPrompt
+                ? `[Transcribed audio]: ${sttTranscription.text}\n\n${originalPrompt}`
+                : sttTranscription.text
+            : originalPrompt;
+        // Attach STT transcription to result
+        const generateResult = this.finalizeGenerateRequestResult({
            generateSpan,
            options,
            textOptions,
            textResult,
            factoryResult,
-            originalPrompt,
+            originalPrompt: effectiveOriginalPrompt,
            startTime,
        });
+        if (sttTranscription) {
+            generateResult.transcription = sttTranscription;
+        }
+        return generateResult;
    }
    async maybeApplyGenerateOrchestration(options) {
        if (!this.enableOrchestration || options.provider || options.model) {
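Taken together, the hunks above give generate() an audio input path. A minimal caller-side sketch, assuming stt.audio accepts a Node Buffer and that result.transcription and result.content keep the shapes referenced in this diff:

```ts
import { NeuroLink } from "@juspay/neurolink";
import { readFile } from "node:fs/promises";

const neurolink = new NeuroLink();
const audio = await readFile("./question.wav");

// Audio-only call: prepareGenerateRequest synthesizes input.text = "" and the
// whisper transcription (the default provider per this diff) becomes the prompt.
const result = await neurolink.generate({
  stt: { enabled: true, audio, provider: "whisper" },
});

console.log(result.transcription?.text); // what the audio said
console.log(result.content);             // the model's answer

// Mixed call: user text plus audio. The transcription is prepended as
// "[Transcribed audio]: ..." context ahead of the typed prompt.
await neurolink.generate({
  input: { text: "Answer the question in the recording in one sentence." },
  stt: { enabled: true, audio },
});
```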
@@ -3080,6 +3159,7 @@ Current user's request: ${currentInput}`;
            input: options.input,
            region: options.region,
            tts: options.tts,
+            stt: options.stt,
            fileRegistry: this.fileRegistry,
            timeout: options.timeout,
            abortSignal: options.abortSignal,
@@ -3124,7 +3204,13 @@ Current user's request: ${currentInput}`;
            toolsUsed: textResult.toolsUsed,
            timestamp: Date.now(),
            result: textResult,
-            prompt
+            // Use the effective prompt (which already incorporates STT-transcribed
+            // text for audio-only calls) so observers see the real prompt instead
+            // of an empty string. Falls back through the same chain as before for
+            // text-only calls.
+            prompt: originalPrompt ||
+                options.input?.text ||
+                options.prompt,
            temperature: textOptions.temperature,
            maxTokens: textOptions.maxTokens,
            // A2 fix: Signal that Pipeline A (AI SDK → @langfuse/otel) already
@@ -3167,6 +3253,7 @@ Current user's request: ${currentInput}`;
                }
                : undefined,
            audio: textResult.audio,
+            transcription: textResult.transcription,
            video: textResult.video,
            ppt: textResult.ppt,
            ...(textResult.retries && { retries: textResult.retries }),
@@ -5088,7 +5175,38 @@ Current user's request: ${currentInput}`;
        const startTime = Date.now();
        const hrTimeStart = process.hrtime.bigint();
        const streamId = `neurolink-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
-
+        // STT preprocessing for stream(): transcribe audio buffer (not realtime frames)
+        // and inject into the prompt before validation/execution. Mirrors generate().
+        const sttOptions = options.stt;
+        const sttAudio = sttOptions?.audio;
+        const hasStreamSttAudio = !!(sttOptions?.enabled && sttAudio);
+        let streamSttTranscription;
+        if (hasStreamSttAudio && sttOptions && sttAudio) {
+            if (!options.input) {
+                options.input = { text: "" };
+            }
+            try {
+                // registerAllProviders() is idempotent; always call.
+                await ProviderRegistry.registerAllProviders();
+                const { STTProcessor } = await import("./utils/sttProcessor.js");
+                const sttProvider = sttOptions.provider ?? "whisper";
+                streamSttTranscription = await STTProcessor.transcribe(sttAudio, sttProvider, sttOptions);
+                if (streamSttTranscription.text) {
+                    const existingText = options.input.text || "";
+                    options.input.text = existingText
+                        ? `[Transcribed audio]: ${streamSttTranscription.text}\n\n${existingText}`
+                        : streamSttTranscription.text;
+                }
+            }
+            catch (sttError) {
+                const existingText = options.input.text || "";
+                if (!existingText) {
+                    throw sttError;
+                }
+                logger.warn(`[NeuroLink] Stream STT transcription failed, falling back to text: ${sttError instanceof Error ? sttError.message : String(sttError)}`);
+            }
+        }
+        const originalPrompt = options.input?.text ?? "";
        options.fileRegistry = this.fileRegistry;
        await this.validateStreamRequestOptions(options, startTime);
        const workflowResult = await this.maybeHandleWorkflowStreamRequest({
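stream() gains the same pre-recorded-audio path (distinct from the realtime input.audio.frames path). An illustrative sketch under the same assumptions as the generate() example above:

```ts
import { NeuroLink } from "@juspay/neurolink";
import { readFile } from "node:fs/promises";

const neurolink = new NeuroLink();
const audio = await readFile("./question.wav");

const streamed = await neurolink.stream({
  // A pre-recorded buffer in stt.audio now satisfies input validation on its
  // own (see the validateStreamRequestOptions hunk below); the transcription
  // is injected into input.text before validation and execution.
  stt: { enabled: true, audio },
});

for await (const chunk of streamed.stream) {
  // ... consume text chunks as usual ...
}
console.log(streamed.transcription?.text);
```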
@@ -5100,7 +5218,21 @@ Current user's request: ${currentInput}`;
            if (workflowResult) {
                return workflowResult;
            }
-
+            // TTS Mode 2 deferred: stream() emits text first, then synthesizes the
+            // accumulated response into a single audio chunk at end-of-stream and
+            // resolves `streamResult.audio` with the same TTSResult. The resolver is
+            // plumbed explicitly through the params bag (M11: previously a
+            // `_streamTtsResolve` cast on the caller's options object — fragile if
+            // the same options object was reused across concurrent stream() calls).
+            const ttsOptions = options.tts;
+            const wantsStreamTtsMode2 = !!(ttsOptions?.enabled && ttsOptions?.useAiResponse);
+            let resolveStreamTtsAudio;
+            const streamTtsAudioPromise = wantsStreamTtsMode2
+                ? new Promise((resolve) => {
+                    resolveStreamTtsAudio = resolve;
+                })
+                : undefined;
+            const streamResult = await this.setLangfuseContextFromOptions(options, () => this.runStandardStreamRequest({
                options,
                streamSpan,
                spanStartTime,
@@ -5108,7 +5240,15 @@ Current user's request: ${currentInput}`;
                hrTimeStart,
                streamId,
                originalPrompt,
+                ttsResolver: resolveStreamTtsAudio,
            }));
+            if (streamSttTranscription) {
+                streamResult.transcription = streamSttTranscription;
+            }
+            if (streamTtsAudioPromise) {
+                streamResult.audio = streamTtsAudioPromise;
+            }
+            return streamResult;
        }
        catch (error) {
            streamSpan.setStatus({
@@ -5159,7 +5299,7 @@ Current user's request: ${currentInput}`;
        return result;
    }
    async runStandardStreamRequest(params) {
-        const { options, streamSpan, spanStartTime, startTime, hrTimeStart, streamId, originalPrompt, } = params;
+        const { options, streamSpan, spanStartTime, startTime, hrTimeStart, streamId, originalPrompt, ttsResolver, } = params;
        logger.debug("[NeuroLink] Running standard stream request", {
            streamId,
            provider: options.provider,
@@ -5244,6 +5384,7 @@ Current user's request: ${currentInput}`;
                    typeof chunk === "object" &&
                    "type" in chunk &&
                    (chunk.type === "audio" ||
+                        chunk.type === "tts_audio" ||
                        chunk.type === "image");
                if (!isNoOutputSentinel && (hasTextContent || hasMediaPayload)) {
                    realOutputChunks++;
@@ -5278,6 +5419,22 @@ Current user's request: ${currentInput}`;
                        accumulatedContent += content;
                    });
                }
+                // TTS Mode 2 for stream(): synthesize the accumulated response
+                // and yield ONE final audio chunk so callers iterating the stream
+                // get the audio inline; also resolve `streamResult.audio` so the
+                // ergonomic `await result.audio` pattern works post-iteration.
+                // m5: synthesis logic lives in a dedicated helper to keep this
+                // generator under the max-lines-per-function lint budget.
+                const ttsModeResult = await self.synthesizeStreamModeTwo({
+                    ttsOptions: enhancedOptions.tts,
+                    providerName,
+                    fallbackProvider: enhancedOptions.provider,
+                    accumulatedContent,
+                    ttsResolver,
+                });
+                if (ttsModeResult.audioChunk) {
+                    yield ttsModeResult.audioChunk;
+                }
                resolvedUsage = streamUsage;
                if (!resolvedUsage && streamAnalytics) {
                    try {
@@ -5343,6 +5500,14 @@ Current user's request: ${currentInput}`;
                throw error;
            }
            finally {
+                // Belt-and-braces: if TTS Mode 2 was requested but synthesis never
+                // ran (stream errored before reaching the TTS block, or Mode 2 path
+                // was skipped), resolve the audio promise to undefined so callers
+                // awaiting `streamResult.audio` never hang. Uses the explicit
+                // `ttsResolver` param (M11), not a side-channel cast.
+                // m4: a duplicate resolution is a silent no-op — Promise resolvers
+                // never throw, so no try/catch needed here.
+                ttsResolver?.(undefined);
                logger.debug("[NeuroLink.stream] Stream finished, performing cleanup", {
                    provider: providerName,
                    model: enhancedOptions.model,
@@ -5489,6 +5654,67 @@ Current user's request: ${currentInput}`;
            return this.handleStreamError(error, options, startTime, streamId, undefined, undefined);
        }
    }
+    /**
+     * TTS Mode 2 synthesis helper for the stream() pipeline.
+     *
+     * m5 — extracted from runStandardStreamRequest so the surrounding generator
+     * stays under the max-lines-per-function lint budget. Behaviour preserved
+     * exactly:
+     * - When Mode 2 is enabled (`tts.enabled && tts.useAiResponse`) AND the
+     *   model produced non-empty content: synthesises one final audio buffer
+     *   and returns it as an `audioChunk` for the caller to `yield`. Resolves
+     *   `ttsResolver` with the `TTSResult`.
+     * - When Mode 2 is enabled but synthesis fails: logs a warning and resolves
+     *   `ttsResolver` with `undefined`.
+     * - When Mode 2 is requested but skipped (empty content / wrong mode):
+     *   resolves `ttsResolver` with `undefined` early so callers awaiting
+     *   `result.audio` unblock before the surrounding `finally` cleanup
+     *   completes (Issue 7 latency micro-opt — the finally block also resolves
+     *   defensively, so this is a redundant early signal, not a coverage fix).
+     */
+    async synthesizeStreamModeTwo(params) {
+        const { ttsOptions, providerName, fallbackProvider, accumulatedContent, ttsResolver, } = params;
+        if (!ttsOptions?.enabled ||
+            !ttsOptions.useAiResponse ||
+            accumulatedContent.trim().length === 0) {
+            ttsResolver?.(undefined);
+            return {};
+        }
+        try {
+            const { TTSProcessor } = await import("./utils/ttsProcessor.js");
+            // ttsOptions.provider takes precedence; otherwise fall back to the
+            // chat provider ID ONLY when it happens to be a registered TTS handler
+            // (e.g. "google-ai" works for both LLM and TTS). For LLM-only IDs like
+            // "anthropic", we'd otherwise complete generation and then fail synth —
+            // surface that mismatch up front instead.
+            const candidate = ttsOptions.provider ?? fallbackProvider ?? providerName;
+            const ttsProvider = candidate && TTSProcessor.supports(candidate) ? candidate : undefined;
+            if (!ttsProvider) {
+                throw new Error(`No TTS provider resolved for stream Mode 2 (set tts.provider explicitly — chat provider "${candidate ?? "<unset>"}" is not a registered TTS handler)`);
+            }
+            const ttsResult = await TTSProcessor.synthesize(accumulatedContent, ttsProvider, ttsOptions);
+            ttsResolver?.(ttsResult);
+            return {
+                audioChunk: {
+                    type: "tts_audio",
+                    audio: {
+                        data: ttsResult.buffer,
+                        format: ttsResult.format,
+                        index: 0,
+                        isFinal: true,
+                        cumulativeSize: ttsResult.size,
+                        voice: ttsResult.voice,
+                        sampleRate: ttsResult.sampleRate,
+                    },
+                },
+            };
+        }
+        catch (ttsError) {
+            logger.warn(`[NeuroLink.stream] Stream TTS Mode 2 synthesis failed: ${ttsError instanceof Error ? ttsError.message : String(ttsError)}`);
+            ttsResolver?.(undefined);
+            return {};
+        }
+    }
    /**
     * Prepare stream options: initialize memory, MCP, retrieval, orchestration,
     * Ollama tool auto-disable, factory processing, and tool detection.
@@ -5519,8 +5745,15 @@ Current user's request: ${currentInput}`;
            orchestratedModel: orchestratedOptions.model,
            prompt: options.input.text?.substring(0, 100),
        });
-        // Use orchestrated options
-
+        // Use orchestrated options — rebind the local `options` to a fresh
+        // merged object instead of mutating the caller-supplied one
+        // (NEW2: avoids cross-call contamination when callers reuse options).
+        // Issue 6: extract to an explicit local so the rebind intent is
+        // obvious to future readers, and the lint suppression is scoped
+        // narrowly to the one statement that actually rebinds the param.
+        const mergedOptions = { ...options, ...orchestratedOptions };
+        // eslint-disable-next-line no-param-reassign -- see NEW2/Issue 6 above
+        options = mergedOptions;
        // Re-resolve model alias in case orchestration returned an alias
        if (orchestratedOptions.model) {
            options.model = resolveModel(options.model, this.modelAliasConfig);
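The NEW2 rebind above exists to stop orchestration's provider/model pick from leaking into a caller-owned options object that may be reused for the next call. A small self-contained illustration of that contamination, not NeuroLink code:

```ts
type Opts = { input: { text: string }; model?: string };

// Mutating style: the orchestrator's pick escapes into the caller's object,
// so a reused options literal carries call #1's model into call #2.
function orchestrateInPlace(options: Opts): void {
  options.model = "model-picked-for-call-1";
}

// Rebinding style (what the diff does): merge into a fresh local object and
// leave the caller's object untouched.
function orchestrateRebound(options: Opts): Opts {
  return { ...options, model: "model-picked-for-call-1" };
}

const reusedA: Opts = { input: { text: "hello" } };
orchestrateInPlace(reusedA);
console.log(reusedA.model); // "model-picked-for-call-1" (leaks into any later call reusing reusedA)

const reusedB: Opts = { input: { text: "hello" } };
const local = orchestrateRebound(reusedB);
console.log(local.model, reusedB.model); // "model-picked-for-call-1" undefined (no leak)
```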
@@ -5787,6 +6020,7 @@ Current user's request: ${currentInput}`;
                    typeof fallbackChunk === "object" &&
                    "type" in fallbackChunk &&
                    (fallbackChunk.type === "audio" ||
+                        fallbackChunk.type === "tts_audio" ||
                        fallbackChunk.type === "image");
                if (!isFallbackNoOutputSentinel &&
                    (fallbackHasTextContent || fallbackHasMediaPayload)) {
@@ -5909,8 +6143,10 @@ Current user's request: ${currentInput}`;
        const hasAudio = !!(options?.input?.audio &&
            options.input.audio.frames &&
            typeof options.input.audio.frames[Symbol.asyncIterator] === "function");
-
-
+        // STT pre-recorded audio buffer counts as input — transcription will fill text.
+        const hasSttAudio = !!(options?.stt?.enabled && options?.stt?.audio);
+        if (!hasText && !hasAudio && !hasSttAudio) {
+            throw new Error("Stream options must include either input.text, input.audio, or stt.audio");
        }
    }
    /**
@@ -7199,6 +7435,7 @@ Current user's request: ${currentInput}`;
            inputSize: inputStr.length,
            truncatedInput: inputStr.length > 2048 ? inputStr.substring(0, 2048) : inputStr,
            options,
+            hitlState: { triggered: false },
        };
    }
    async executeToolWithSpan(toolName, params, options, executionContext, toolSpan) {
@@ -7305,7 +7542,7 @@ Current user's request: ${currentInput}`;
            circuitBreakerState: prepared.circuitBreaker.getState(),
        });
        const result = await prepared.circuitBreaker.execute(async () => {
-            return withRetry(async () => withTimeout(this.executeToolInternal(toolName, params, prepared.finalOptions), prepared.finalOptions.timeout, ErrorFactory.toolTimeout(toolName, prepared.finalOptions.timeout)), {
+            return withRetry(async () => withTimeout(this.executeToolInternal(toolName, params, prepared.finalOptions, executionContext.hitlState), prepared.finalOptions.timeout, ErrorFactory.toolTimeout(toolName, prepared.finalOptions.timeout)), {
                maxAttempts: prepared.finalOptions.maxRetries + 1,
                delayMs: prepared.finalOptions.retryDelayMs,
                isRetriable: isRetriableError,
@@ -7526,7 +7763,7 @@ Current user's request: ${currentInput}`;
     * - Annotations: skip cache for destructive tools, retry safe tools on failure
     * - Middleware: apply global middleware chain before execution
     */
-    async executeToolInternal(toolName, params, options) {
+    async executeToolInternal(toolName, params, options, HITLState) {
        const functionTag = "NeuroLink.executeToolInternal";
        // === MCP ENHANCEMENT: Infer annotations for cache/retry decisions ===
        const toolAnnotations = this.getToolAnnotationsForExecution(toolName);
@@ -7645,6 +7882,7 @@ Current user's request: ${currentInput}`;
        const context = {
            ...storedContext,
            ...passedAuthContext,
+            hitlState: HITLState,
        };
        logger.debug(`[Using merged context for unified registry tool:`, {
            toolName,
@@ -233,6 +233,7 @@ export class LaminarExporter extends BaseExporter {
    [SpanType.PPT_GENERATION]: "custom",
    [SpanType.WORKFLOW]: "workflow",
    [SpanType.TTS]: "custom",
+    [SpanType.STT]: "custom",
    [SpanType.SERVER_REQUEST]: "custom",
    [SpanType.CUSTOM]: "custom",
 };
@@ -216,6 +216,7 @@ export class PostHogExporter extends BaseExporter {
    [SpanType.PPT_GENERATION]: "ai_ppt_generation",
    [SpanType.WORKFLOW]: "ai_workflow",
    [SpanType.TTS]: "ai_tts_synthesis",
+    [SpanType.STT]: "ai_stt_transcription",
    [SpanType.SERVER_REQUEST]: "ai_server_request",
    [SpanType.CUSTOM]: "ai_custom_span",
 };
@@ -0,0 +1,14 @@
+/**
+ * Constant-time bearer-token comparison.
+ *
+ * Bug 2 mitigation: a normal `===` compare on bearer tokens leaks the token
+ * length and the position of the first mismatching byte through timing
+ * differences, which is reachable when the voice server is bound publicly
+ * (`VOICE_SERVER_ALLOW_PUBLIC=1`).
+ *
+ * Returns `false` for any comparison whose lengths differ — this avoids the
+ * `RangeError` that `crypto.timingSafeEqual` throws on mismatched buffers
+ * while still preserving the constant-time property for equal-length inputs
+ * (which is the only case an attacker can probe).
+ */
+export declare function timingSafeEqualString(provided: string, expected: string): boolean;
@@ -0,0 +1,22 @@
+import crypto from "crypto";
+/**
+ * Constant-time bearer-token comparison.
+ *
+ * Bug 2 mitigation: a normal `===` compare on bearer tokens leaks the token
+ * length and the position of the first mismatching byte through timing
+ * differences, which is reachable when the voice server is bound publicly
+ * (`VOICE_SERVER_ALLOW_PUBLIC=1`).
+ *
+ * Returns `false` for any comparison whose lengths differ — this avoids the
+ * `RangeError` that `crypto.timingSafeEqual` throws on mismatched buffers
+ * while still preserving the constant-time property for equal-length inputs
+ * (which is the only case an attacker can probe).
+ */
+export function timingSafeEqualString(provided, expected) {
+    const providedBuf = Buffer.from(provided, "utf8");
+    const expectedBuf = Buffer.from(expected, "utf8");
+    if (providedBuf.length !== expectedBuf.length) {
+        return false;
+    }
+    return crypto.timingSafeEqual(providedBuf, expectedBuf);
+}
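Behaviour of the new helper at its two boundaries, restated as a usage sketch (values are illustrative; the relative import matches its location under dist/server/voice/):

```ts
import { timingSafeEqualString } from "./tokenCompare.js";

// Equal-length inputs go through crypto.timingSafeEqual, so comparison time
// does not depend on where the first mismatching byte sits.
timingSafeEqualString("secret-token-1234", "secret-token-1234"); // true
timingSafeEqualString("secret-token-1234", "secret-token-9999"); // false

// Length mismatch returns false up front instead of surfacing the RangeError
// that crypto.timingSafeEqual throws for buffers of different byte lengths.
timingSafeEqualString("short", "a-much-longer-expected-token"); // false
```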
@@ -4,6 +4,7 @@ import http from "http";
 import path from "path";
 import { fileURLToPath } from "url";
 import { setupWebSocket } from "./voiceWebSocketHandler.js";
+import { timingSafeEqualString } from "./tokenCompare.js";
 import { NeuroLink } from "../../neurolink.js";
 import { logger } from "../../utils/logger.js";
 import { withTimeout } from "../../utils/async/withTimeout.js";
@@ -30,6 +31,50 @@ function resolvePublicPath() {
 }
 export async function startVoiceServer(port = 3000) {
    const app = express();
+    // NEW11: refuse to bind to non-loopback interfaces unless the operator
+    // has explicitly opted in. The voice server has minimal hardening and
+    // exposing it publicly without a token leaks Soniox / Cartesia / LLM
+    // credit usage to anyone who can reach the listener.
+    const allowPublic = process.env.VOICE_SERVER_ALLOW_PUBLIC === "1";
+    const host = allowPublic
+        ? (process.env.VOICE_SERVER_HOST ?? "0.0.0.0")
+        : "127.0.0.1";
+    // NEW11: optional shared-secret bearer token for both HTTP and WebSocket
+    // upgrade. When VOICE_SERVER_AUTH_TOKEN is set, every HTTP request must
+    // carry `Authorization: Bearer <token>`. The WS upgrade additionally
+    // accepts `?token=<token>` because browser WebSocket constructors cannot
+    // set custom headers — see voiceWebSocketHandler.verifyClient. HTTP routes
+    // intentionally reject `?token=` (would leak via Referer + access logs).
+    const authToken = process.env.VOICE_SERVER_AUTH_TOKEN;
+    /* ---------- BODY LIMITS + AUTH ---------- */
+    // NEW11: cap JSON / urlencoded body to 100kb. Express's default is 100kb
+    // for json() but only when explicitly registered; without this any future
+    // body parser would default to whatever its own limit is.
+    app.use(express.json({ limit: "100kb" }));
+    app.use(express.urlencoded({ limit: "100kb", extended: false }));
+    // NEW11: minimal HTTP auth middleware. Skips when no token is configured
+    // (back-compat — local-only dev keeps working). Skips for /health so
+    // load-balancers can probe without credentials.
+    if (authToken) {
+        app.use((req, res, next) => {
+            if (req.path === "/health") {
+                return next();
+            }
+            const header = req.header("authorization");
+            // Bug 3 fix: HTTP routes only accept the bearer header. The `?token=`
+            // fallback exists only on the WS upgrade where the browser API cannot
+            // attach headers — using it on regular HTTP would leak credentials via
+            // Referer headers, browser history, server access logs, and proxies.
+            const provided = header?.startsWith("Bearer ")
+                ? header.slice(7)
+                : undefined;
+            if (!provided || !timingSafeEqualString(provided, authToken)) {
+                res.status(401).json({ error: "Unauthorized" });
+                return;
+            }
+            next();
+        });
+    }
    /* ---------- STATIC FILES ---------- */
    const publicPath = resolvePublicPath();
    logger.info("[SERVER] Serving static from:", publicPath);
@@ -41,15 +86,29 @@ export async function startVoiceServer(port = 3000) {
    app.get("/health", (_, res) => {
        res.json({ status: "ok" });
    });
+    /* ---------- ERROR HANDLER ---------- */
+    // NEW11: global Express error handler so synchronous and async errors are
+    // caught instead of crashing the process or leaking stack traces.
+    app.use((err, _req, res, _next) => {
+        logger.error(`[SERVER] Unhandled error: ${err instanceof Error ? err.message : String(err)}`);
+        if (!res.headersSent) {
+            res.status(500).json({ error: "Internal server error" });
+        }
+    });
    const server = http.createServer(app);
    /* ---------- WS ---------- */
-
+    // NEW11: pass the auth token + allow-public flag through to the WS handler
+    // so it can verify clients on upgrade and apply maxPayload caps.
+    setupWebSocket(server, { authToken, maxPayload: 1_048_576 });
    /* ---------- START ---------- */
    await new Promise((resolve, reject) => {
        server.once("error", reject);
-        server.listen(port, () => {
+        server.listen(port, host, () => {
            server.removeListener("error", reject);
-
+            const exposure = allowPublic
+                ? `bound publicly on ${host}:${port} (VOICE_SERVER_ALLOW_PUBLIC=1)`
+                : `bound to loopback ${host}:${port} (set VOICE_SERVER_ALLOW_PUBLIC=1 to expose externally)`;
+            logger.info(`[SERVER] Voice server running — ${exposure}${authToken ? " (auth required)" : " (no auth — token via VOICE_SERVER_AUTH_TOKEN recommended)"}`);
            resolve();
        });
    });
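A possible operator-side setup for the hardened server above. The environment variable names, the /health exemption, and the bearer-header requirement come from this diff; the deep import specifier for startVoiceServer is an assumption, since its published entry point is not shown here:

```ts
// Assumed deep import; adjust to however your build resolves the package's dist files.
import { startVoiceServer } from "@juspay/neurolink/dist/server/voice/voiceServerApp.js";

process.env.VOICE_SERVER_AUTH_TOKEN = "a-long-random-shared-secret";
process.env.VOICE_SERVER_ALLOW_PUBLIC = "1"; // omit to stay bound to 127.0.0.1
process.env.VOICE_SERVER_HOST = "0.0.0.0";

await startVoiceServer(3000);

// /health stays unauthenticated so load balancers can probe it.
await fetch("http://127.0.0.1:3000/health");

// Every other HTTP route requires the bearer header; ?token= is rejected on
// HTTP on purpose (it would leak via Referer headers and access logs).
await fetch("http://127.0.0.1:3000/", {
  headers: { authorization: `Bearer ${process.env.VOICE_SERVER_AUTH_TOKEN}` },
});
```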
@@ -1,7 +1,24 @@
 import type { Server as HttpServer } from "http";
 /**
- *
- *
+ * Voice-server-mode environment configuration.
+ *
+ * @deprecated NEW12 — this used to mutate `process.env.NEUROLINK_DISABLE_MCP_TOOLS`
+ * which is process-wide. That broke any embedder that called this function in
+ * a process which ALSO used NeuroLink for non-voice work. The disable-tools
+ * intent is now passed explicitly via `disableTools: true` on every NeuroLink
+ * `generate()` / `stream()` call inside this server (see line ~167). Calling
+ * this function is now a no-op kept for backwards compatibility.
 */
 export declare function configureVoiceServerEnvironment(): void;
-
+/**
+ * Returns a copy of an outbound Soniox payload with the API key redacted.
+ *
+ * Use this whenever debug logging the auth frame — never JSON.stringify the
+ * raw object. (C3 mitigation: prevents the Soniox API key from leaking into
+ * any aggregated log sink even if a future debug statement serialises the
+ * outbound payload.)
+ */
+export declare function redactSonioxAuth<T extends {
+    api_key?: string;
+}>(payload: T): T;
+export declare function setupWebSocket(server: HttpServer, options?: import("../../types/index.js").ServerVoiceWebSocketOptions): void;
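The comments above describe the client side of the token handshake: browsers cannot attach an Authorization header to a WebSocket upgrade, so the token travels once as a query parameter on the upgrade request only. A browser-side sketch (the path is illustrative; use whatever path setupWebSocket serves in your deployment):

```ts
// Browser context. Use the same value as VOICE_SERVER_AUTH_TOKEN on the server.
const token = "a-long-random-shared-secret";
const ws = new WebSocket(`ws://localhost:3000/?token=${encodeURIComponent(token)}`);

ws.addEventListener("open", () => {
  // ... start streaming microphone audio frames to the voice server ...
});
```

On the server side, any debug logging of the outbound Soniox auth frame should go through redactSonioxAuth(payload) rather than serialising the raw object, per the C3 note above.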