npm - @juspay/neurolink - Versions diffs - 9.61.1 → 9.62.0 - Mend

@juspay/neurolink 9.61.1 → 9.62.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (141) hide show

package/CHANGELOG.md +12 -0
package/README.md +23 -17
package/dist/adapters/tts/googleTTSHandler.js +1 -1
package/dist/browser/neurolink.min.js +382 -364
package/dist/cli/commands/serve.js +9 -0
package/dist/cli/commands/voiceServer.d.ts +7 -0
package/dist/cli/commands/voiceServer.js +9 -1
package/dist/cli/factories/commandFactory.js +136 -11
package/dist/cli/loop/optionsSchema.d.ts +1 -1
package/dist/cli/utils/audioFileUtils.d.ts +3 -3
package/dist/cli/utils/audioFileUtils.js +5 -1
package/dist/core/baseProvider.js +29 -6
package/dist/factories/providerRegistry.d.ts +14 -0
package/dist/factories/providerRegistry.js +141 -2
package/dist/lib/adapters/tts/googleTTSHandler.js +1 -1
package/dist/lib/core/baseProvider.js +29 -6
package/dist/lib/factories/providerRegistry.d.ts +14 -0
package/dist/lib/factories/providerRegistry.js +141 -2
package/dist/lib/mcp/toolRegistry.js +7 -1
package/dist/lib/neurolink.d.ts +19 -0
package/dist/lib/neurolink.js +252 -14
package/dist/lib/observability/exporters/laminarExporter.js +1 -0
package/dist/lib/observability/exporters/posthogExporter.js +1 -0
package/dist/lib/observability/utils/spanSerializer.js +1 -0
package/dist/lib/server/voice/tokenCompare.d.ts +14 -0
package/dist/lib/server/voice/tokenCompare.js +23 -0
package/dist/lib/server/voice/voiceServerApp.js +62 -3
package/dist/lib/server/voice/voiceWebSocketHandler.d.ts +20 -3
package/dist/lib/server/voice/voiceWebSocketHandler.js +555 -435
package/dist/lib/types/generate.d.ts +47 -0
package/dist/lib/types/hitl.d.ts +3 -0
package/dist/lib/types/index.d.ts +1 -1
package/dist/lib/types/index.js +1 -1
package/dist/lib/types/realtime.d.ts +243 -0
package/dist/lib/types/realtime.js +70 -0
package/dist/lib/types/server.d.ts +68 -0
package/dist/lib/types/span.d.ts +2 -0
package/dist/lib/types/span.js +2 -0
package/dist/lib/types/stream.d.ts +36 -14
package/dist/lib/types/stt.d.ts +585 -0
package/dist/lib/types/stt.js +90 -0
package/dist/lib/types/tools.d.ts +2 -0
package/dist/lib/types/tts.d.ts +23 -11
package/dist/lib/types/tts.js +7 -0
package/dist/lib/types/voice.d.ts +272 -0
package/dist/lib/types/voice.js +137 -0
package/dist/lib/utils/audioFormatDetector.d.ts +15 -0
package/dist/lib/utils/audioFormatDetector.js +34 -0
package/dist/lib/utils/errorHandling.js +4 -0
package/dist/lib/utils/sttProcessor.d.ts +115 -0
package/dist/lib/utils/sttProcessor.js +295 -0
package/dist/lib/voice/RealtimeVoiceAPI.d.ts +183 -0
package/dist/lib/voice/RealtimeVoiceAPI.js +439 -0
package/dist/lib/voice/audio-utils.d.ts +135 -0
package/dist/lib/voice/audio-utils.js +435 -0
package/dist/lib/voice/errors.d.ts +123 -0
package/dist/lib/voice/errors.js +386 -0
package/dist/lib/voice/index.d.ts +26 -0
package/dist/lib/voice/index.js +55 -0
package/dist/lib/voice/providers/AzureSTT.d.ts +47 -0
package/dist/lib/voice/providers/AzureSTT.js +345 -0
package/dist/lib/voice/providers/AzureTTS.d.ts +59 -0
package/dist/lib/voice/providers/AzureTTS.js +349 -0
package/dist/lib/voice/providers/DeepgramSTT.d.ts +40 -0
package/dist/lib/voice/providers/DeepgramSTT.js +550 -0
package/dist/lib/voice/providers/ElevenLabsTTS.d.ts +53 -0
package/dist/lib/voice/providers/ElevenLabsTTS.js +311 -0
package/dist/lib/voice/providers/GeminiLive.d.ts +52 -0
package/dist/lib/voice/providers/GeminiLive.js +372 -0
package/dist/lib/voice/providers/GoogleSTT.d.ts +60 -0
package/dist/lib/voice/providers/GoogleSTT.js +454 -0
package/dist/lib/voice/providers/OpenAIRealtime.d.ts +47 -0
package/dist/lib/voice/providers/OpenAIRealtime.js +412 -0
package/dist/lib/voice/providers/OpenAISTT.d.ts +41 -0
package/dist/lib/voice/providers/OpenAISTT.js +286 -0
package/dist/lib/voice/providers/OpenAITTS.d.ts +49 -0
package/dist/lib/voice/providers/OpenAITTS.js +271 -0
package/dist/lib/voice/stream-handler.d.ts +166 -0
package/dist/lib/voice/stream-handler.js +514 -0
package/dist/mcp/toolRegistry.js +7 -1
package/dist/neurolink.d.ts +19 -0
package/dist/neurolink.js +252 -14
package/dist/observability/exporters/laminarExporter.js +1 -0
package/dist/observability/exporters/posthogExporter.js +1 -0
package/dist/observability/utils/spanSerializer.js +1 -0
package/dist/server/voice/tokenCompare.d.ts +14 -0
package/dist/server/voice/tokenCompare.js +22 -0
package/dist/server/voice/voiceServerApp.js +62 -3
package/dist/server/voice/voiceWebSocketHandler.d.ts +20 -3
package/dist/server/voice/voiceWebSocketHandler.js +555 -435
package/dist/types/generate.d.ts +47 -0
package/dist/types/hitl.d.ts +3 -0
package/dist/types/index.d.ts +1 -1
package/dist/types/index.js +1 -1
package/dist/types/realtime.d.ts +243 -0
package/dist/types/realtime.js +69 -0
package/dist/types/server.d.ts +68 -0
package/dist/types/span.d.ts +2 -0
package/dist/types/span.js +2 -0
package/dist/types/stream.d.ts +36 -14
package/dist/types/stt.d.ts +585 -0
package/dist/types/stt.js +89 -0
package/dist/types/tools.d.ts +2 -0
package/dist/types/tts.d.ts +23 -11
package/dist/types/tts.js +7 -0
package/dist/types/voice.d.ts +272 -0
package/dist/types/voice.js +136 -0
package/dist/utils/audioFormatDetector.d.ts +15 -0
package/dist/utils/audioFormatDetector.js +33 -0
package/dist/utils/errorHandling.js +4 -0
package/dist/utils/sttProcessor.d.ts +115 -0
package/dist/utils/sttProcessor.js +294 -0
package/dist/voice/RealtimeVoiceAPI.d.ts +183 -0
package/dist/voice/RealtimeVoiceAPI.js +438 -0
package/dist/voice/audio-utils.d.ts +135 -0
package/dist/voice/audio-utils.js +434 -0
package/dist/voice/errors.d.ts +123 -0
package/dist/voice/errors.js +385 -0
package/dist/voice/index.d.ts +26 -0
package/dist/voice/index.js +54 -0
package/dist/voice/providers/AzureSTT.d.ts +47 -0
package/dist/voice/providers/AzureSTT.js +344 -0
package/dist/voice/providers/AzureTTS.d.ts +59 -0
package/dist/voice/providers/AzureTTS.js +348 -0
package/dist/voice/providers/DeepgramSTT.d.ts +40 -0
package/dist/voice/providers/DeepgramSTT.js +549 -0
package/dist/voice/providers/ElevenLabsTTS.d.ts +53 -0
package/dist/voice/providers/ElevenLabsTTS.js +310 -0
package/dist/voice/providers/GeminiLive.d.ts +52 -0
package/dist/voice/providers/GeminiLive.js +371 -0
package/dist/voice/providers/GoogleSTT.d.ts +60 -0
package/dist/voice/providers/GoogleSTT.js +453 -0
package/dist/voice/providers/OpenAIRealtime.d.ts +47 -0
package/dist/voice/providers/OpenAIRealtime.js +411 -0
package/dist/voice/providers/OpenAISTT.d.ts +41 -0
package/dist/voice/providers/OpenAISTT.js +285 -0
package/dist/voice/providers/OpenAITTS.d.ts +49 -0
package/dist/voice/providers/OpenAITTS.js +270 -0
package/dist/voice/stream-handler.d.ts +166 -0
package/dist/voice/stream-handler.js +513 -0
package/package.json +5 -2

package/dist/lib/types/generate.d.ts CHANGED Viewed

@@ -10,6 +10,7 @@ import type { MiddlewareFactoryOptions, OnFinishCallback, OnErrorCallback } from
 import type { DirectorModeOptions, DirectorSegment, VideoGenerationResult, VideoOutputOptions } from "./multimodal.js";
 import type { PPTGenerationResult, PPTOutputOptions } from "./ppt.js";
 import type { TTSOptions, TTSResult } from "./tts.js";
+import type { STTOptions, STTResult } from "./stt.js";
 import type { StandardRecord, ValidationSchema, ZodUnknownSchema } from "./aliases.js";
 import type { NeurolinkCredentials } from "./providers.js";
 import type { FileWithMetadata } from "./file.js";
@@ -142,6 +143,27 @@ export type GenerateOptions = {
      * ```
      */
     tts?: TTSOptions;
+    /**
+     * Speech-to-Text (STT) configuration
+     *
+     * Enable audio transcription. When enabled, the audio provided via `stt.audio`
+     * will be transcribed to text and used as the prompt.
+     *
+     * @example
+     * ```typescript
+     * const neurolink = new NeuroLink();
+     * const result = await neurolink.generate({
+     *   input: { text: "" },
+     *   provider: "openai",
+     *   stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
+     * });
+     * // STT transcribes the audio, result.transcription contains the transcription
+     * ```
+     */
+    stt?: STTOptions & {
+        provider?: string;
+        audio?: Buffer | ArrayBuffer;
+    };
     /**
      * Thinking/reasoning configuration for extended thinking models
      *
@@ -660,6 +682,8 @@ export type GenerateResult = {
     reasoning?: string;
     /** Token count for reasoning content */
     reasoningTokens?: number;
+    /** STT transcription result (present when stt.enabled is true and audio input was provided) */
+    transcription?: STTResult;
     retries?: {
         count: number;
         errors: Array<{
@@ -868,6 +892,27 @@ export type TextGenerationOptions = {
      * ```
      */
     tts?: TTSOptions;
+    /**
+     * Speech-to-Text (STT) configuration
+     *
+     * Enable audio transcription. When enabled, the audio provided via `stt.audio`
+     * will be transcribed to text and used as the prompt.
+     *
+     * @example
+     * ```typescript
+     * const neurolink = new NeuroLink();
+     * const result = await neurolink.generate({
+     *   input: { text: "" },
+     *   provider: "openai",
+     *   stt: { enabled: true, provider: "whisper", language: "en-US", audio: audioBuffer }
+     * });
+     * // STT transcribes the audio, result.transcription contains the transcription
+     * ```
+     */
+    stt?: STTOptions & {
+        provider?: string;
+        audio?: Buffer | ArrayBuffer;
+    };
     enableEvaluation?: boolean;
     enableAnalytics?: boolean;
     context?: Record<string, JsonValue>;
@@ -1033,6 +1078,8 @@ export type TextGenerationResult = {
     analytics?: AnalyticsData;
     evaluation?: EvaluationData;
     audio?: TTSResult;
+    /** STT transcription result (present when stt input was processed) */
+    transcription?: STTResult;
     /** Video generation result */
     video?: VideoGenerationResult;
     /** PowerPoint generation result */

package/dist/lib/types/hitl.d.ts CHANGED Viewed

@@ -232,3 +232,6 @@ export type HITLManager = {
     on(event: string, listener: (...args: unknown[]) => void): HITLManager;
     emit(event: string, ...args: unknown[]): boolean;
 };
+export type HITLExecutionState = {
+    triggered: boolean;
+};

package/dist/lib/types/index.d.ts CHANGED Viewed

@@ -48,7 +48,7 @@ export * from "./subscription.js";
 export * from "./task.js";
 export * from "./taskClassification.js";
 export * from "./tools.js";
-export * from "./tts.js";
+export * from "./voice.js";
 export * from "./universalProviderOptions.js";
 export * from "./utilities.js";
 export * from "./workflow.js";

package/dist/lib/types/index.js CHANGED Viewed

@@ -49,7 +49,7 @@ export * from "./subscription.js";
 export * from "./task.js";
 export * from "./taskClassification.js";
 export * from "./tools.js";
-export * from "./tts.js";
+export * from "./voice.js";
 export * from "./universalProviderOptions.js";
 export * from "./utilities.js";
 export * from "./workflow.js";

package/dist/lib/types/realtime.d.ts ADDED Viewed

@@ -0,0 +1,243 @@
+/**
+ * Realtime Voice Type Definitions for NeuroLink
+ *
+ * All realtime/bidirectional voice types: session, config, messages,
+ * event handlers, provider types, handler types, error codes, defaults,
+ * and type guards.
+ *
+ * @module types/realtime
+ */
+import type { TTSAudioFormat } from "./tts.js";
+type RealtimeProviderCapability = "tts" | "stt" | "realtime" | "streaming";
+/**
+ * Realtime session state
+ */
+export type RealtimeSessionState = "disconnected" | "connecting" | "connected" | "disconnecting" | "error";
+/**
+ * Realtime voice configuration passed to `connect()`; only `provider` is required.
+ */
+export type RealtimeConfig = {
+    /**
+     * Provider to use. Must match a handler key registered with
+     * `RealtimeProcessor.registerHandler()` — currently `"openai-realtime"`
+     * and `"gemini-live"` (both registered in `providerRegistry.ts`).
+     * Aliasing is handled at registry/CLI parse time, not here.
+     * `isValidRealtimeConfig()` rejects any other value.
+     */
+    provider: "openai-realtime" | "gemini-live";
+    /** API key */
+    apiKey?: string;
+    /** Model to use */
+    model?: string;
+    /** Voice for TTS output */
+    voice?: string;
+    /** Input language */
+    inputLanguage?: string;
+    /** Output language */
+    outputLanguage?: string;
+    /** System prompt for the AI. NOTE(review): overlaps with `instructions`; precedence is not visible here — confirm */
+    systemPrompt?: string;
+    /** Session timeout in milliseconds; `isValidRealtimeConfig()` rejects non-numbers and values <= 0 */
+    timeout?: number;
+    /** Audio input format */
+    inputFormat?: TTSAudioFormat;
+    /** Audio output format */
+    outputFormat?: TTSAudioFormat;
+    /** Input sample rate (`DEFAULT_REALTIME_CONFIG` uses 24000) */
+    inputSampleRate?: number;
+    /** Output sample rate (`DEFAULT_REALTIME_CONFIG` uses 24000) */
+    outputSampleRate?: number;
+    /** Enable voice activity detection */
+    vadEnabled?: boolean;
+    /** VAD threshold, 0-1 inclusive; `isValidRealtimeConfig()` rejects values outside that range */
+    vadThreshold?: number;
+    /** Turn detection mode */
+    turnDetection?: "server_vad" | "manual";
+    /** Instructions/system prompt for the session. NOTE(review): overlaps with `systemPrompt` — confirm which one providers read */
+    instructions?: string;
+    /** Temperature for AI responses */
+    temperature?: number;
+    /** Tools/functions available to the model */
+    tools?: RealtimeTool[];
+};
+/**
+ * Realtime tool definition
+ */
+export type RealtimeTool = {
+    /** Tool name */
+    name: string;
+    /** Tool description */
+    description: string;
+    /** JSON schema for parameters */
+    parameters: Record<string, unknown>;
+};
+/**
+ * Realtime session information
+ */
+export type RealtimeSession = {
+    /** Session ID */
+    id: string;
+    /** Current state */
+    state: RealtimeSessionState;
+    /** Provider name — narrowed to the validated config provider union so
+     *  session state stays aligned with what `connect()` accepts. */
+    provider: RealtimeConfig["provider"];
+    /** Model being used */
+    model?: string;
+    /** Session creation time */
+    createdAt: Date;
+    /** Last activity time */
+    lastActivityAt: Date;
+    /** Session configuration */
+    config: RealtimeConfig;
+    /** Check if session is open */
+    isOpen?: () => boolean;
+    /** Close the session */
+    close?: () => Promise<void>;
+};
+/**
+ * Realtime audio chunk
+ */
+export type RealtimeAudioChunk = {
+    /** Audio data */
+    data: Buffer;
+    /** Chunk sequence number */
+    index: number;
+    /** Whether this is the final chunk */
+    isFinal: boolean;
+    /** Audio format */
+    format: TTSAudioFormat;
+    /** Sample rate */
+    sampleRate?: number;
+    /** Duration of this chunk in milliseconds */
+    durationMs?: number;
+};
+/**
+ * Realtime message types
+ */
+export type RealtimeMessageType = "audio" | "text" | "transcript" | "function_call" | "function_result" | "error" | "session_update" | "turn_start" | "turn_end";
+/**
+ * Realtime message
+ */
+export type RealtimeMessage = {
+    /** Message type */
+    type: RealtimeMessageType;
+    /** Message ID */
+    id?: string;
+    /** Audio data (for audio messages) */
+    audio?: RealtimeAudioChunk;
+    /** Text content (for text/transcript messages) */
+    text?: string;
+    /** Whether this is a partial result */
+    isPartial?: boolean;
+    /** Function call data */
+    functionCall?: {
+        name: string;
+        arguments: Record<string, unknown>;
+    };
+    /** Function result data */
+    functionResult?: {
+        name: string;
+        result: unknown;
+    };
+    /** Error information */
+    error?: {
+        code: string;
+        message: string;
+    };
+    /** Timestamp */
+    timestamp: Date;
+};
+/**
+ * Realtime event handler callbacks
+ */
+export type RealtimeEventHandlers = {
+    /** Called when audio is received */
+    onAudio?: (chunk: RealtimeAudioChunk) => void;
+    /** Called when text/transcript is received */
+    onTranscript?: (text: string, isFinal: boolean) => void;
+    /** Called when the model generates text */
+    onText?: (text: string, isFinal: boolean) => void;
+    /** Called when a function call is requested */
+    onFunctionCall?: (name: string, args: Record<string, unknown>) => Promise<unknown>;
+    /** Called when session state changes */
+    onStateChange?: (state: RealtimeSessionState) => void;
+    /** Called when an error occurs */
+    onError?: (error: Error) => void;
+    /** Called when a turn starts */
+    onTurnStart?: () => void;
+    /** Called when a turn ends */
+    onTurnEnd?: () => void;
+};
+/**
+ * Realtime voice provider type (bidirectional audio)
+ */
+export type RealtimeVoiceProvider = {
+    /** Provider name identifier */
+    readonly name: string;
+    /** Get supported capabilities */
+    getCapabilities(): RealtimeProviderCapability[];
+    /** Check if provider is properly configured */
+    isConfigured(): boolean;
+    /** Validate provider configuration */
+    validateConfig(): Promise<{
+        valid: boolean;
+        errors: string[];
+    }>;
+    /** Get provider-specific options schema */
+    getOptionsSchema?(): Record<string, unknown>;
+    /**
+     * Create a new realtime session
+     */
+    connect(config: RealtimeConfig): Promise<RealtimeSession>;
+    /**
+     * Check if connected
+     */
+    isConnected(): boolean;
+    /**
+     * Disconnect from realtime session
+     */
+    disconnect(): Promise<void>;
+    /**
+     * Get current session configuration
+     */
+    getSessionConfig(): RealtimeConfig | null;
+};
+export type RealtimeHandler = {
+    readonly name: string;
+    connect(config: RealtimeConfig): Promise<RealtimeSession>;
+    disconnect(): Promise<void>;
+    isConnected(): boolean;
+    getSession(): RealtimeSession | null;
+    sendAudio(audio: Buffer | RealtimeAudioChunk): Promise<void>;
+    sendText?(text: string): Promise<void>;
+    triggerResponse?(): Promise<void>;
+    cancelResponse?(): Promise<void>;
+    on(handlers: RealtimeEventHandlers): void;
+    off(): void;
+    isConfigured(): boolean;
+    getSupportedFormats(): TTSAudioFormat[];
+};
+/**
+ * Realtime error codes
+ */
+export declare const REALTIME_ERROR_CODES: {
+    readonly CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED";
+    readonly SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT";
+    readonly PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR";
+    readonly AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR";
+    readonly PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED";
+    readonly PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED";
+    readonly SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE";
+    readonly SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE";
+    readonly INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE";
+};
+/**
+ * Default realtime configuration
+ */
+export declare const DEFAULT_REALTIME_CONFIG: Partial<RealtimeConfig>;
+/**
+ * Type guard for valid RealtimeConfig
+ */
+export declare function isValidRealtimeConfig(config: unknown): config is RealtimeConfig;
+export {};

package/dist/lib/types/realtime.js ADDED Viewed

@@ -0,0 +1,70 @@
+/**
+ * Realtime Voice Type Definitions for NeuroLink
+ *
+ * All realtime/bidirectional voice types: session, config, messages,
+ * event handlers, provider types, handler types, error codes, defaults,
+ * and type guards.
+ *
+ * @module types/realtime
+ */
+// ============================================================================
+// REALTIME ERROR CODES
+// ============================================================================
+/**
+ * Realtime error codes; each value is its own key prefixed with `REALTIME_`.
+ */
+export const REALTIME_ERROR_CODES = {
+    CONNECTION_FAILED: "REALTIME_CONNECTION_FAILED",
+    SESSION_TIMEOUT: "REALTIME_SESSION_TIMEOUT",
+    PROTOCOL_ERROR: "REALTIME_PROTOCOL_ERROR",
+    AUDIO_STREAM_ERROR: "REALTIME_AUDIO_STREAM_ERROR",
+    PROVIDER_NOT_CONFIGURED: "REALTIME_PROVIDER_NOT_CONFIGURED",
+    PROVIDER_NOT_SUPPORTED: "REALTIME_PROVIDER_NOT_SUPPORTED",
+    SESSION_ALREADY_ACTIVE: "REALTIME_SESSION_ALREADY_ACTIVE",
+    SESSION_NOT_ACTIVE: "REALTIME_SESSION_NOT_ACTIVE",
+    INVALID_MESSAGE: "REALTIME_INVALID_MESSAGE",
+};
+// ============================================================================
+// REALTIME DEFAULTS
+// ============================================================================
+/**
+ * Default realtime configuration (timeout is in milliseconds). No `provider` default is set; callers must supply one.
+ */
+export const DEFAULT_REALTIME_CONFIG = {
+    timeout: 30000,
+    inputSampleRate: 24000,
+    outputSampleRate: 24000,
+    vadEnabled: true,
+    vadThreshold: 0.5,
+    turnDetection: "server_vad",
+};
+// ============================================================================
+// REALTIME TYPE GUARDS
+// ============================================================================
+/**
+ * Type guard for valid RealtimeConfig. Rejects non-objects, unknown providers,
+ * a `timeout` that is not a number > 0, and a `vadThreshold` that is not a
+ * number within [0, 1]. NaN is rejected for both numeric fields.
+ */
+export function isValidRealtimeConfig(config) {
+    if (!config || typeof config !== "object") {
+        return false;
+    }
+    const conf = config;
+    if (!["openai-realtime", "gemini-live"].includes(conf.provider)) {
+        return false;
+    }
+    // Positive comparisons: every comparison with NaN is false, so NaN fails here.
+    if (conf.timeout !== undefined &&
+        !(typeof conf.timeout === "number" && conf.timeout > 0)) {
+        return false;
+    }
+    if (conf.vadThreshold !== undefined &&
+        !(typeof conf.vadThreshold === "number" &&
+            conf.vadThreshold >= 0 &&
+            conf.vadThreshold <= 1)) {
+        return false;
+    }
+    return true;
+}
+//# sourceMappingURL=realtime.js.map

package/dist/lib/types/server.d.ts CHANGED Viewed

@@ -1119,3 +1119,71 @@ export type CobraInstance = {
     process: (pcm: Int16Array) => number;
     release: () => void;
 };
+/**
+ * Per-WebSocket-connection context object passed to the voice connection
+ * handler. Holds shared singletons that all per-connection state derives from.
+ *
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
+ */
+export type ServerVoiceConnectionCtx = {
+    neurolink: NeuroLink;
+    accessKey: string;
+};
+/**
+ * Per-session mutable state for one voice WebSocket connection.
+ *
+ * Threaded through the voice connection helper functions so each connection
+ * has fully isolated turn / TTS / VAD / barge-in state. The class types
+ * (`FrameBus`, `TurnManager`, `CartesiaStream`) are imported as types here so
+ * that this file remains the single source of truth — consumers import this
+ * type via the barrel and do not redefine it locally.
+ *
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type.)
+ */
+export type ServerVoiceSessionState = {
+    cobra: CobraInstance | null;
+    FRAME_LENGTH: number;
+    FRAME_BYTES: number;
+    bus: import("../server/voice/frameBus.js").FrameBus;
+    turnManager: import("../server/voice/turnManager.js").TurnManager;
+    sonioxWs: import("ws").WebSocket | null;
+    keepAliveTimer: NodeJS.Timeout | null;
+    sonioxReconnectTimer: ReturnType<typeof setTimeout> | null;
+    sessionClosed: boolean;
+    transcriptBuffer: string;
+    activeTTS: import("../adapters/tts/cartesiaHandler.js").CartesiaStream | null;
+    conversation: ConversationMessage[];
+    currentTurnId: number;
+    activePipelineTurnId: number | null;
+    turnAborters: Set<{
+        aborted: boolean;
+    }>;
+    playbackResetTimer: NodeJS.Timeout | null;
+    bargeInLockedUntil: number;
+    isSpeaking: boolean;
+    silenceFrameCount: number;
+    voiceFrameCount: number;
+    frameRemainder: Buffer;
+};
+/**
+ * Options accepted by `setupWebSocket()` in `server/voice/voiceWebSocketHandler.ts`.
+ *
+ * (Server-prefixed per CLAUDE.md Rule 9 — server-tier type. Lives in
+ * `server.ts` rather than `cli.ts` because it configures a server-side
+ * WebSocket upgrade handler, not CLI argument parsing.)
+ */
+export type ServerVoiceWebSocketOptions = {
+    /**
+     * Optional shared-secret bearer token. When set, the WebSocket upgrade
+     * handshake must include `Authorization: Bearer <token>` or
+     * `?token=<token>` in the URL. When unset, anyone who can reach the
+     * server can open a session and consume Soniox / Cartesia / LLM credits.
+     */
+    authToken?: string;
+    /**
+     * Maximum WebSocket message size in bytes. Defaults to 1 MiB. Caps both
+     * inbound audio frames and any client control messages — guards against
+     * OOM via oversized uploads.
+     */
+    maxPayload?: number;
+};

package/dist/lib/types/span.d.ts CHANGED Viewed

@@ -37,6 +37,8 @@ export declare enum SpanType {
     WORKFLOW = "workflow",
     /** TTS synthesis */
     TTS = "tts",
+    /** STT transcription */
+    STT = "stt",
     /** Server adapter request */
     SERVER_REQUEST = "server.request",
     /** Custom span */

package/dist/lib/types/span.js CHANGED Viewed

@@ -38,6 +38,8 @@ export var SpanType;
     SpanType["WORKFLOW"] = "workflow";
     /** TTS synthesis */
     SpanType["TTS"] = "tts";
+    /** STT transcription */
+    SpanType["STT"] = "stt";
     /** Server adapter request */
     SpanType["SERVER_REQUEST"] = "server.request";
     /** Custom span */

package/dist/lib/types/stream.d.ts CHANGED Viewed

@@ -11,7 +11,8 @@ import type { ChatMessage } from "./conversation.js";
 import type { StreamNoOutputSentinel } from "./noOutputSentinel.js";
 import type { AdditionalMemoryUser } from "./generate.js";
 import type { AIModelProviderConfig, NeurolinkCredentials } from "./providers.js";
-import type { TTSChunk, TTSOptions } from "./tts.js";
+import type { TTSChunk, TTSOptions, TTSResult } from "./tts.js";
+import type { STTOptions, STTResult } from "./stt.js";
 import type { StandardRecord, ValidationSchema } from "./aliases.js";
 import type { FileWithMetadata } from "./file.js";
 import type { WorkflowConfig } from "./workflow.js";
@@ -127,9 +128,9 @@ export type AudioChunk = {
  * ```typescript
  * const audioBuffer: Buffer[] = [];
  * for await (const chunk of result.stream) {
- *   if (chunk.type === "audio") {
- *     audioBuffer.push(chunk.audioChunk.data); // TypeScript knows 'audioChunk' exists
- *     if (chunk.audioChunk.isFinal) {
+ *   if (chunk.type === "tts_audio") {
+ *     audioBuffer.push(chunk.audio.data); // TypeScript knows 'audio' exists
+ *     if (chunk.audio.isFinal) {
  *       const fullAudio = Buffer.concat(audioBuffer);
  *       fs.writeFileSync('output.mp3', fullAudio);
  *     }
@@ -144,8 +145,8 @@ export type AudioChunk = {
  *     case "text":
  *       process.stdout.write(chunk.content);
  *       break;
- *     case "audio":
- *       playAudioChunk(chunk.audioChunk.data);
+ *     case "tts_audio":
+ *       playAudioChunk(chunk.audio.data);
  *       break;
  *   }
  * }
@@ -157,10 +158,12 @@ export type StreamChunk = {
     /** Text content chunk */
     content: string;
 } | {
-    /** Discriminator for audio chunks */
-    type: "audio";
+    /** Discriminator for synthesized TTS audio chunks. Uses `tts_audio`
+     *  (not `audio`) to avoid colliding with realtime AudioChunk and to
+     *  match the runtime shape emitted by `BaseProvider.stream()`. */
+    type: "tts_audio";
     /** TTS audio chunk data */
-    audioChunk: TTSChunk;
+    audio: TTSChunk;
 };
 export type StreamOptions = {
     input: {
@@ -226,9 +229,9 @@ export type StreamOptions = {
      * for await (const chunk of result.stream) {
      *   if (chunk.type === "text") {
      *     process.stdout.write(chunk.content);
-     *   } else if (chunk.type === "audio") {
+     *   } else if (chunk.type === "tts_audio") {
      *     // Handle audio chunk
-     *     playAudioChunk(chunk.audioChunk.data);
+     *     playAudioChunk(chunk.audio.data);
      *   }
      * }
      * ```
@@ -249,6 +252,15 @@ export type StreamOptions = {
      * ```
      */
     tts?: TTSOptions;
+    /**
+     * Speech-to-Text (STT) configuration for streaming
+     *
+     * When enabled, audio from `stt.audio` is transcribed before streaming begins.
+     */
+    stt?: STTOptions & {
+        provider?: string;
+        audio?: Buffer | ArrayBuffer;
+    };
     /**
      * Thinking/reasoning configuration for extended thinking models
      *
@@ -491,6 +503,9 @@ export type StreamResult = {
     } | StreamNoOutputSentinel | {
         type: "audio";
         audio: AudioChunk;
+    } | {
+        type: "tts_audio";
+        audio: TTSChunk;
     } | {
         type: "image";
         imageOutput: {
@@ -499,9 +514,6 @@ export type StreamResult = {
     } | {
         content: string;
         type?: "preliminary" | "final";
-    } | {
-        type: "audio";
-        audio: AudioChunk;
     }>;
     provider?: string;
     model?: string;
@@ -566,6 +578,16 @@ export type StreamResult = {
         workflowId: string;
         workflowName: string;
     };
+    /** STT transcription result (when stt option is used) */
+    transcription?: STTResult;
+    /**
+     * TTS Mode 2 result (when `tts.enabled && tts.useAiResponse`).
+     * Resolves with the synthesized audio after the stream completes;
+     * resolves to undefined if TTS was not enabled or synthesis failed.
+     * The same audio is also yielded as a final chunk on `stream` for callers
+     * that prefer to consume it inline.
+     */
+    audio?: Promise<TTSResult | undefined>;
 };
 /**
  * Enhanced provider type with stream method