npm - @volley/recognition-client-sdk - Versions diffs - 0.1.296 → 0.1.381 - Mend

@volley/recognition-client-sdk 0.1.296 → 0.1.381

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/dist/browser.bundled.d.ts +1163 -0
package/dist/index.bundled.d.ts +2342 -0
package/dist/index.d.ts +1 -1
package/dist/index.d.ts.map +1 -1
package/dist/index.js +146 -90
package/dist/index.js.map +4 -4
package/dist/recog-client-sdk.browser.js +85 -81
package/dist/recog-client-sdk.browser.js.map +4 -4
package/dist/recognition-client.d.ts +4 -0
package/dist/recognition-client.d.ts.map +1 -1
package/dist/simplified-vgf-recognition-client.d.ts +2 -0
package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
package/dist/vgf-recognition-state.d.ts +9 -0
package/dist/vgf-recognition-state.d.ts.map +1 -1
package/package.json +27 -25
package/src/index.ts +2 -0
package/src/recognition-client.ts +54 -2
package/src/simplified-vgf-recognition-client.spec.ts +302 -17
package/src/simplified-vgf-recognition-client.ts +68 -12
package/src/vgf-recognition-state.ts +12 -0

package/dist/browser.bundled.d.ts ADDED Viewed

@@ -0,0 +1,1163 @@
+import { z } from 'zod';
+/**
+ * Provider types and enums for recognition services
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
+ */
+/**
+ * Supported speech recognition providers.
+ * Each value is the lowercase string identifier for the provider. OpenAI has
+ * two entries (batch and realtime); Gemini has a batch entry only.
+ */
+declare enum RecognitionProvider {
+    ASSEMBLYAI = "assemblyai",
+    DEEPGRAM = "deepgram",
+    ELEVENLABS = "elevenlabs",
+    FIREWORKS = "fireworks",
+    GOOGLE = "google",
+    GEMINI_BATCH = "gemini-batch",
+    OPENAI_BATCH = "openai-batch",
+    OPENAI_REALTIME = "openai-realtime"
+}
+/**
+ * ASR API type - distinguishes between streaming and file-based transcription APIs
+ * - STREAMING: Real-time streaming APIs (Deepgram, AssemblyAI, Google)
+ * - FILE_BASED: File upload/batch APIs (OpenAI Batch, Gemini Batch)
+ *
+ * Also used as the optional `apiType` field of MetadataResultSchemaV1.
+ */
+declare enum ASRApiType {
+    STREAMING = "streaming",
+    FILE_BASED = "file-based"
+}
+/**
+ * Deepgram model names.
+ * Values are the model identifier strings (presumably the names Deepgram's API
+ * expects - TODO confirm against the provider mapper).
+ */
+declare enum DeepgramModel {
+    NOVA_2 = "nova-2",
+    NOVA_3 = "nova-3",
+    FLUX_GENERAL_EN = "flux-general-en"
+}
+/**
+ * Google Cloud Speech models.
+ * Values are the snake_case model identifier strings.
+ * @see https://cloud.google.com/speech-to-text/docs/transcription-model
+ * @see https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model
+ */
+declare enum GoogleModel {
+    CHIRP_3 = "chirp_3",
+    CHIRP_2 = "chirp_2",
+    CHIRP = "chirp",
+    LATEST_LONG = "latest_long",
+    LATEST_SHORT = "latest_short",
+    TELEPHONY = "telephony",
+    TELEPHONY_SHORT = "telephony_short",
+    DEFAULT = "default",
+    COMMAND_AND_SEARCH = "command_and_search",
+    PHONE_CALL = "phone_call",
+    VIDEO = "video"
+}
+/**
+ * Fireworks AI models for ASR.
+ * Note the naming asymmetry: ASR_V1 maps to "fireworks-asr-large", whereas
+ * ASR_V2 maps to "fireworks-asr-v2".
+ * @see https://docs.fireworks.ai/guides/querying-asr-models
+ * @see https://fireworks.ai/models/fireworks/fireworks-asr-large
+ */
+declare enum FireworksModel {
+    ASR_V1 = "fireworks-asr-large",
+    ASR_V2 = "fireworks-asr-v2",
+    WHISPER_V3 = "whisper-v3",
+    WHISPER_V3_TURBO = "whisper-v3-turbo"
+}
+/**
+ * ElevenLabs Scribe models for speech-to-text.
+ * Lists the Scribe v2 realtime model and Scribe v1.
+ * @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime
+ * @see https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming
+ * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert
+ */
+declare enum ElevenLabsModel {
+    SCRIBE_V2_REALTIME = "scribe_v2_realtime",
+    SCRIBE_V1 = "scribe_v1"
+}
+/**
+ * OpenAI Realtime API transcription models
+ * These are the verified `input_audio_transcription.model` values.
+ * Only one model is listed at present.
+ * @see https://platform.openai.com/docs/guides/realtime
+ */
+declare enum OpenAIRealtimeModel {
+    GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
+}
+/**
+ * Type alias for any model from any provider.
+ * Because the union ends in `string`, any string value is accepted; the named
+ * enums document the known model identifiers but do not restrict the type.
+ */
+type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
+/**
+ * Audio encoding types.
+ * A numeric enum with IDs 0-5. The namespace of the same name declared in this
+ * file provides ID/name conversion and validation helpers.
+ */
+declare enum AudioEncoding {
+    ENCODING_UNSPECIFIED = 0,
+    LINEAR16 = 1,
+    OGG_OPUS = 2,
+    FLAC = 3,
+    MULAW = 4,
+    ALAW = 5
+}
+/**
+ * Helper functions attached to the AudioEncoding enum. The namespace shares the
+ * enum's name, so TypeScript merges the two declarations and the helpers are
+ * reached as `AudioEncoding.fromId(...)`, etc.
+ */
+declare namespace AudioEncoding {
+    /**
+     * Convert numeric ID to AudioEncoding enum
+     * @param id - Numeric encoding identifier (0-5)
+     * @returns AudioEncoding enum value or undefined if invalid
+     */
+    function fromId(id: number): AudioEncoding | undefined;
+    /**
+     * Convert string name to AudioEncoding enum
+     * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
+     * @returns AudioEncoding enum value or undefined if invalid
+     */
+    function fromName(nameStr: string): AudioEncoding | undefined;
+    /**
+     * Convert AudioEncoding enum to numeric ID
+     * @param encoding - AudioEncoding enum value
+     * @returns Numeric ID (0-5)
+     */
+    function toId(encoding: AudioEncoding): number;
+    /**
+     * Convert AudioEncoding enum to string name
+     * @param encoding - AudioEncoding enum value
+     * @returns String name like "LINEAR16", "MULAW", etc.
+     */
+    function toName(encoding: AudioEncoding): string;
+    /**
+     * Check if a numeric ID is a valid encoding
+     * @param id - Numeric identifier to validate
+     * @returns true if valid encoding ID
+     */
+    function isIdValid(id: number): boolean;
+    /**
+     * Check if a string name is a valid encoding
+     * @param nameStr - String name to validate
+     * @returns true if valid encoding name
+     */
+    function isNameValid(nameStr: string): boolean;
+}
+/**
+ * Common sample rates (in Hz).
+ * Each member's numeric value equals the rate in Hz. The namespace of the same
+ * name declared in this file provides conversion and validation helpers.
+ */
+declare enum SampleRate {
+    RATE_8000 = 8000,
+    RATE_16000 = 16000,
+    RATE_22050 = 22050,
+    RATE_24000 = 24000,
+    RATE_32000 = 32000,
+    RATE_44100 = 44100,
+    RATE_48000 = 48000
+}
+/**
+ * Helper functions attached to the SampleRate enum. The namespace shares the
+ * enum's name, so TypeScript merges the two declarations and the helpers are
+ * reached as `SampleRate.fromHz(...)`, etc.
+ */
+declare namespace SampleRate {
+    /**
+     * Convert Hz value to SampleRate enum
+     * @param hz - Sample rate in Hz (8000, 16000, etc.)
+     * @returns SampleRate enum value or undefined if invalid
+     */
+    function fromHz(hz: number): SampleRate | undefined;
+    /**
+     * Convert string name to SampleRate enum
+     * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
+     * @returns SampleRate enum value or undefined if invalid
+     */
+    function fromName(nameStr: string): SampleRate | undefined;
+    /**
+     * Convert SampleRate enum to Hz value
+     * @param rate - SampleRate enum value
+     * @returns Hz value (8000, 16000, etc.)
+     */
+    function toHz(rate: SampleRate): number;
+    /**
+     * Convert SampleRate enum to string name
+     * @param rate - SampleRate enum value
+     * @returns String name like "RATE_8000", "RATE_16000", etc.
+     */
+    function toName(rate: SampleRate): string;
+    /**
+     * Check if a numeric Hz value is a valid sample rate
+     * @param hz - Hz value to validate
+     * @returns true if valid sample rate
+     */
+    function isHzValid(hz: number): boolean;
+    /**
+     * Check if a string name is a valid sample rate
+     * @param nameStr - String name to validate
+     * @returns true if valid sample rate name
+     */
+    function isNameValid(nameStr: string): boolean;
+}
+/**
+ * Supported languages for recognition.
+ * Values are BCP-47 language tags. ASRRequestConfig.language is typed
+ * `Language | string`, so tags not listed here can still be passed as strings.
+ */
+declare enum Language {
+    ENGLISH_US = "en-US",
+    ENGLISH_GB = "en-GB",
+    SPANISH_ES = "es-ES",
+    SPANISH_MX = "es-MX",
+    FRENCH_FR = "fr-FR",
+    GERMAN_DE = "de-DE",
+    ITALIAN_IT = "it-IT",
+    PORTUGUESE_BR = "pt-BR",
+    JAPANESE_JP = "ja-JP",
+    KOREAN_KR = "ko-KR",
+    CHINESE_CN = "zh-CN",
+    CHINESE_TW = "zh-TW"
+}
+/**
+ * Recognition Result Types V1
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
+ * Types and schemas for recognition results sent to SDK clients
+ */
+/**
+ * Message type discriminator for recognition results V1.
+ * TRANSCRIPTION, FUNCTION_CALL, METADATA and ERROR are the `type` literals of
+ * the corresponding result schemas in this file. No schema for
+ * CLIENT_CONTROL_MESSAGE is visible in this section.
+ */
+declare enum RecognitionResultTypeV1 {
+    TRANSCRIPTION = "Transcription",
+    FUNCTION_CALL = "FunctionCall",
+    METADATA = "Metadata",
+    ERROR = "Error",
+    CLIENT_CONTROL_MESSAGE = "ClientControlMessage"
+}
+/**
+ * Transcription result V1 - contains transcript message
+ * In the long run the game side should not need to know about it. In the short run it is sent back to the client.
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
+ *
+ * Required fields: `type`, `audioUtteranceId`, `finalTranscript`, `is_finished`; every other field is optional.
+ * `is_finished` is snake_case, unlike the camelCase fields around it.
+ * "strip" is Zod's unknown-keys mode: keys not declared in the schema are dropped when parsing.
+ */
+declare const TranscriptionResultSchemaV1: z.ZodObject<{
+    type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
+    audioUtteranceId: z.ZodString;
+    finalTranscript: z.ZodString;
+    finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
+    pendingTranscript: z.ZodOptional<z.ZodString>;
+    pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
+    is_finished: z.ZodBoolean;
+    voiceStart: z.ZodOptional<z.ZodNumber>;
+    voiceDuration: z.ZodOptional<z.ZodNumber>;
+    voiceEnd: z.ZodOptional<z.ZodNumber>;
+    startTimestamp: z.ZodOptional<z.ZodNumber>;
+    endTimestamp: z.ZodOptional<z.ZodNumber>;
+    receivedAtMs: z.ZodOptional<z.ZodNumber>;
+    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
+}, "strip", z.ZodTypeAny, {
+    type: RecognitionResultTypeV1.TRANSCRIPTION;
+    audioUtteranceId: string;
+    finalTranscript: string;
+    is_finished: boolean;
+    finalTranscriptConfidence?: number | undefined;
+    pendingTranscript?: string | undefined;
+    pendingTranscriptConfidence?: number | undefined;
+    voiceStart?: number | undefined;
+    voiceDuration?: number | undefined;
+    voiceEnd?: number | undefined;
+    startTimestamp?: number | undefined;
+    endTimestamp?: number | undefined;
+    receivedAtMs?: number | undefined;
+    accumulatedAudioTimeMs?: number | undefined;
+}, {
+    type: RecognitionResultTypeV1.TRANSCRIPTION;
+    audioUtteranceId: string;
+    finalTranscript: string;
+    is_finished: boolean;
+    finalTranscriptConfidence?: number | undefined;
+    pendingTranscript?: string | undefined;
+    pendingTranscriptConfidence?: number | undefined;
+    voiceStart?: number | undefined;
+    voiceDuration?: number | undefined;
+    voiceEnd?: number | undefined;
+    startTimestamp?: number | undefined;
+    endTimestamp?: number | undefined;
+    receivedAtMs?: number | undefined;
+    accumulatedAudioTimeMs?: number | undefined;
+}>;
+/** Parsed shape of a transcription result, inferred from TranscriptionResultSchemaV1. */
+type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
+/**
+ * Function call result V1 - similar to LLM function call
+ * In the long run game server should know it, rather than TV or client.
+ *
+ * All four fields are required. `functionArgJson` is typed as a plain string
+ * (presumably JSON-encoded arguments, going by the name - TODO confirm).
+ */
+declare const FunctionCallResultSchemaV1: z.ZodObject<{
+    type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>;
+    audioUtteranceId: z.ZodString;
+    functionName: z.ZodString;
+    functionArgJson: z.ZodString;
+}, "strip", z.ZodTypeAny, {
+    type: RecognitionResultTypeV1.FUNCTION_CALL;
+    audioUtteranceId: string;
+    functionName: string;
+    functionArgJson: string;
+}, {
+    type: RecognitionResultTypeV1.FUNCTION_CALL;
+    audioUtteranceId: string;
+    functionName: string;
+    functionArgJson: string;
+}>;
+/** Parsed shape of a function call result, inferred from FunctionCallResultSchemaV1. */
+type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
+/**
+ * Metadata result V1 - contains metadata, timing information, and ASR config
+ * Sent when the provider connection closes to provide final timing metrics and config
+ * In the long run game server should know it, rather than TV or client.
+ *
+ * Only `type` and `audioUtteranceId` are required; every other field is optional.
+ * `asrConfig` and `rawAsrMetadata` are typed as strings (presumably serialized
+ * JSON - TODO confirm). `costInUSD` is an optional wrapper around a Zod default.
+ */
+declare const MetadataResultSchemaV1: z.ZodObject<{
+    type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
+    audioUtteranceId: z.ZodString;
+    recordingStartMs: z.ZodOptional<z.ZodNumber>;
+    recordingEndMs: z.ZodOptional<z.ZodNumber>;
+    transcriptEndMs: z.ZodOptional<z.ZodNumber>;
+    socketCloseAtMs: z.ZodOptional<z.ZodNumber>;
+    duration: z.ZodOptional<z.ZodNumber>;
+    volume: z.ZodOptional<z.ZodNumber>;
+    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
+    costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
+    apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
+    asrConfig: z.ZodOptional<z.ZodString>;
+    rawAsrMetadata: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    type: RecognitionResultTypeV1.METADATA;
+    audioUtteranceId: string;
+    recordingStartMs?: number | undefined;
+    recordingEndMs?: number | undefined;
+    transcriptEndMs?: number | undefined;
+    socketCloseAtMs?: number | undefined;
+    duration?: number | undefined;
+    volume?: number | undefined;
+    accumulatedAudioTimeMs?: number | undefined;
+    costInUSD?: number | undefined;
+    apiType?: ASRApiType | undefined;
+    asrConfig?: string | undefined;
+    rawAsrMetadata?: string | undefined;
+}, {
+    type: RecognitionResultTypeV1.METADATA;
+    audioUtteranceId: string;
+    recordingStartMs?: number | undefined;
+    recordingEndMs?: number | undefined;
+    transcriptEndMs?: number | undefined;
+    socketCloseAtMs?: number | undefined;
+    duration?: number | undefined;
+    volume?: number | undefined;
+    accumulatedAudioTimeMs?: number | undefined;
+    costInUSD?: number | undefined;
+    apiType?: ASRApiType | undefined;
+    asrConfig?: string | undefined;
+    rawAsrMetadata?: string | undefined;
+}>;
+/** Parsed shape of a metadata result, inferred from MetadataResultSchemaV1. */
+type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
+/**
+ * Error type enum V1 - categorizes different types of errors.
+ * Used as the optional `errorType` field of ErrorResultSchemaV1.
+ */
+declare enum ErrorTypeV1 {
+    AUTHENTICATION_ERROR = "authentication_error",
+    VALIDATION_ERROR = "validation_error",
+    PROVIDER_ERROR = "provider_error",
+    TIMEOUT_ERROR = "timeout_error",
+    QUOTA_EXCEEDED = "quota_exceeded",
+    CONNECTION_ERROR = "connection_error",
+    UNKNOWN_ERROR = "unknown_error"
+}
+/**
+ * Error result V1 - contains error message
+ * In the long run game server should know it, rather than TV or client.
+ *
+ * Only `type` and `audioUtteranceId` are required. `code` may be either a
+ * string or a number.
+ */
+declare const ErrorResultSchemaV1: z.ZodObject<{
+    type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>;
+    audioUtteranceId: z.ZodString;
+    errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>;
+    message: z.ZodOptional<z.ZodString>;
+    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
+    description: z.ZodOptional<z.ZodString>;
+}, "strip", z.ZodTypeAny, {
+    type: RecognitionResultTypeV1.ERROR;
+    audioUtteranceId: string;
+    errorType?: ErrorTypeV1 | undefined;
+    message?: string | undefined;
+    code?: string | number | undefined;
+    description?: string | undefined;
+}, {
+    type: RecognitionResultTypeV1.ERROR;
+    audioUtteranceId: string;
+    errorType?: ErrorTypeV1 | undefined;
+    message?: string | undefined;
+    code?: string | number | undefined;
+    description?: string | undefined;
+}>;
+/** Parsed shape of an error result, inferred from ErrorResultSchemaV1. */
+type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>;
+/**
+ * Recognition Context Types V1
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
+ * Types and schemas for recognition context data
+ */
+/**
+ * Message type discriminator for recognition context V1.
+ * GAME_CONTEXT is the `type` literal of GameContextSchemaV1; schemas for the
+ * other two members are not visible in this section.
+ */
+declare enum RecognitionContextTypeV1 {
+    GAME_CONTEXT = "GameContext",
+    CONTROL_SIGNAL = "ControlSignal",
+    ASR_REQUEST = "ASRRequest"
+}
+/**
+ * Control signal types for recognition V1.
+ * Two signals are defined: start recording and stop recording. Values are
+ * snake_case strings.
+ */
+declare enum ControlSignalTypeV1 {
+    START_RECORDING = "start_recording",
+    STOP_RECORDING = "stop_recording"
+}
+/**
+ * Game context V1 - contains game state information.
+ *
+ * Required fields: `type`, `gameId`, `gamePhase`. The three prompt fields and
+ * `slotMap` are optional. `slotMap` maps a string key to a list of strings.
+ * NOTE(review): the STT / STF / TTF suffixes are not explained here - presumably
+ * speech-to-text / speech-to-function / text-to-function; confirm with the service.
+ */
+declare const GameContextSchemaV1: z.ZodObject<{
+    type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>;
+    gameId: z.ZodString;
+    gamePhase: z.ZodString;
+    promptSTT: z.ZodOptional<z.ZodString>;
+    promptSTF: z.ZodOptional<z.ZodString>;
+    promptTTF: z.ZodOptional<z.ZodString>;
+    slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
+}, "strip", z.ZodTypeAny, {
+    type: RecognitionContextTypeV1.GAME_CONTEXT;
+    gameId: string;
+    gamePhase: string;
+    promptSTT?: string | undefined;
+    promptSTF?: string | undefined;
+    promptTTF?: string | undefined;
+    slotMap?: Record<string, string[]> | undefined;
+}, {
+    type: RecognitionContextTypeV1.GAME_CONTEXT;
+    gameId: string;
+    gamePhase: string;
+    promptSTT?: string | undefined;
+    promptSTF?: string | undefined;
+    promptTTF?: string | undefined;
+    slotMap?: Record<string, string[]> | undefined;
+}>;
+/** Parsed shape of a game context message, inferred from GameContextSchemaV1. */
+type GameContextV1 = z.infer<typeof GameContextSchemaV1>;
+/**
+ * Unified ASR Request Configuration
+ *
+ * Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests.
+ * This interface provides a consistent API for clients regardless of the underlying provider.
+ *
+ * All fields use library-defined enums for type safety and consistency.
+ * Provider-specific mappers will convert these to provider-native formats.
+ */
+/**
+ * Final transcript stability modes
+ *
+ * Controls timeout duration for fallback final transcript after stopRecording().
+ * Similar to AssemblyAI's turn detection confidence modes but applied to our
+ * internal timeout mechanism when vendors don't respond with is_final=true.
+ *
+ * Timeouts at a glance: aggressive 100ms, balanced 200ms (default),
+ * conservative 400ms, experimental 10000ms.
+ *
+ * @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection
+ */
+declare enum FinalTranscriptStability {
+    /**
+     * Aggressive mode: 100ms timeout
+     * Fast response, optimized for short utterances and quick back-and-forth
+     * Use cases: IVR, quick commands, retail confirmations
+     */
+    AGGRESSIVE = "aggressive",
+    /**
+     * Balanced mode: 200ms timeout (default)
+     * Natural middle ground for most conversational scenarios
+     * Use cases: General customer support, tech support, typical voice interactions
+     */
+    BALANCED = "balanced",
+    /**
+     * Conservative mode: 400ms timeout
+     * Wait longer for providers, optimized for complex/reflective speech
+     * Use cases: Healthcare, complex queries, careful thought processes
+     */
+    CONSERVATIVE = "conservative",
+    /**
+     * Experimental mode: 10000ms (10 seconds) timeout
+     * Very long wait for batch/async providers that need significant processing time
+     * Use cases: Batch processing (Gemini, OpenAI Whisper), complex audio analysis
+     * Note: Should be cancelled immediately when transcript is received
+     */
+    EXPERIMENTAL = "experimental"
+}
+/**
+ * Unified ASR request configuration
+ *
+ * This configuration is used by:
+ * - Client SDKs to specify recognition parameters
+ * - Demo applications for user input
+ * - Service layer to configure provider sessions
+ *
+ * Core fields only - all provider-specific options go in providerOptions
+ *
+ * Required fields: `provider`, `language`, `sampleRate`, `encoding`.
+ *
+ * @example
+ * ```typescript
+ * const config: ASRRequestConfig = {
+ *   provider: RecognitionProvider.GOOGLE,
+ *   model: GoogleModel.LATEST_LONG,
+ *   language: Language.ENGLISH_US,
+ *   sampleRate: SampleRate.RATE_16000, // or just 16000
+ *   encoding: AudioEncoding.LINEAR16,
+ *   providerOptions: {
+ *     google: {
+ *       enableAutomaticPunctuation: true,
+ *       interimResults: true,
+ *       singleUtterance: false
+ *     }
+ *   }
+ * };
+ * ```
+ */
+interface ASRRequestConfig {
+    /**
+     * The ASR provider to use
+     * Intended to be one of the supported providers in the RecognitionProvider enum.
+     * Note: the type also admits arbitrary strings, so this is not enforced at compile time.
+     */
+    provider: RecognitionProvider | string;
+    /**
+     * Optional model specification for the provider
+     * Can be provider-specific model enum or string
+     * If not specified, provider's default model will be used
+     */
+    model?: RecognitionModel;
+    /**
+     * Language/locale for recognition
+     * Use Language enum for common languages
+     * Can also accept BCP-47 language tags as strings
+     */
+    language: Language | string;
+    /**
+     * Audio sample rate in Hz
+     * Prefer using SampleRate enum values for standard rates
+     * Can also accept numeric Hz values (e.g., 16000)
+     */
+    sampleRate: SampleRate | number;
+    /**
+     * Audio encoding format
+     * Must match the actual audio data being sent
+     * Use AudioEncoding enum for standard formats
+     * Note: AudioEncoding is a numeric enum, yet the alternative here is `string`
+     * (presumably an encoding name such as "LINEAR16" - TODO confirm what the server accepts).
+     */
+    encoding: AudioEncoding | string;
+    /**
+     * Enable interim (partial) results during recognition
+     * When true, receive real-time updates before finalization
+     * When false, only receive final results
+     * Default: false
+     */
+    interimResults?: boolean;
+    /**
+     * Require GameContext before starting recognition such as song titles
+     * When true, server waits for GameContext message before processing audio
+     * When false, recognition starts immediately
+     * Default: false
+     */
+    useContext?: boolean;
+    /**
+     * Final transcript stability mode
+     *
+     * Controls timeout duration for fallback final transcript when provider
+     * doesn't respond with is_final=true after stopRecording().
+     *
+     * - aggressive: 100ms - fast response, may cut off slow providers
+     * - balanced: 200ms - current default, good for most cases
+     * - conservative: 400ms - wait longer for complex utterances
+     * - experimental: 10000ms - very long wait for batch/async providers
+     *
+     * @default 'balanced'
+     * @see FinalTranscriptStability enum for detailed descriptions
+     */
+    finalTranscriptStability?: FinalTranscriptStability | string;
+    /**
+     * Additional provider-specific options
+     *
+     * Common options per provider:
+     * - Deepgram: punctuate, smart_format, diarize, utterances
+     * - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets
+     * - AssemblyAI: formatTurns, filter_profanity, word_boost
+     *
+     * Note: interimResults is now a top-level field, but can still be overridden per provider
+     *
+     * The value type is `any`, so option names and values are not type-checked.
+     *
+     * @example
+     * ```typescript
+     * providerOptions: {
+     *   google: {
+     *     enableAutomaticPunctuation: true,
+     *     singleUtterance: false,
+     *     enableWordTimeOffsets: false
+     *   }
+     * }
+     * ```
+     */
+    providerOptions?: Record<string, any>;
+    /**
+     * Optional fallback ASR configurations
+     *
+     * List of alternative ASR configurations to use if the primary fails.
+     * Each fallback config is a complete ASRRequestConfig that will be tried
+     * in order until one succeeds.
+     *
+     * @example
+     * ```typescript
+     * fallbackModels: [
+     *   {
+     *     provider: RecognitionProvider.DEEPGRAM,
+     *     model: DeepgramModel.NOVA_2,
+     *     language: Language.ENGLISH_US,
+     *     sampleRate: 16000,
+     *     encoding: AudioEncoding.LINEAR16
+     *   },
+     *   {
+     *     provider: RecognitionProvider.GOOGLE,
+     *     model: GoogleModel.LATEST_SHORT,
+     *     language: Language.ENGLISH_US,
+     *     sampleRate: 16000,
+     *     encoding: AudioEncoding.LINEAR16
+     *   }
+     * ]
+     * ```
+     */
+    fallbackModels?: ASRRequestConfig[];
+}
+/**
+ * Standard stage/environment constants used across all services.
+ * A readonly object whose values are the string literals "local", "dev",
+ * "staging" and "production".
+ */
+declare const STAGES: {
+    readonly LOCAL: "local";
+    readonly DEV: "dev";
+    readonly STAGING: "staging";
+    readonly PRODUCTION: "production";
+};
+/** Union of the STAGES values: "local" | "dev" | "staging" | "production". */
+type Stage = typeof STAGES[keyof typeof STAGES];
+/**
+ * Generic WebSocket protocol types and utilities
+ * Supports flexible versioning and message types
+ * Used by both client and server implementations
+ */
+/**
+ * Base message structure - completely flexible
+ * @template V - Version type (number, string, etc.)
+ */
+interface Message<V = number> {
+    /** Message version */
+    v: V;
+    /** Message type string */
+    type: string;
+    /** Optional payload; untyped at this level, so callers must narrow it before use */
+    data?: unknown;
+}
+/**
+ * Version serializer interface
+ * Converts between version type V and byte representation
+ * @template V - Version type being serialized
+ */
+interface VersionSerializer<V> {
+    /** Convert a version value to its numeric (byte) form */
+    serialize: (v: V) => number;
+    /** Convert a numeric (byte) value back to a version value */
+    deserialize: (byte: number) => V;
+}
+/**
+ * WebSocketAudioClient - Abstract base class for WebSocket clients
+ * Sends audio and control messages, receives responses from server
+ *
+ * Features:
+ * - Generic version type support (number, string, etc.)
+ * - Type-safe upward/downward message data
+ * - Client-side backpressure monitoring
+ * - Abstract hooks for application-specific logic
+ * - Format-agnostic audio protocol (supports any encoding)
+ */
+/**
+ * Constructor configuration for WebSocketAudioClient.
+ * `url` is required. `highWM` / `lowWM` are optional numbers - presumably the
+ * high/low water marks used for backpressure monitoring (TODO confirm units and defaults).
+ */
+type ClientConfig = {
+    url: string;
+    highWM?: number;
+    lowWM?: number;
+};
+/**
+ * WebSocketAudioClient - Abstract base class for WebSocket clients
+ * that send audio frames and JSON messages
+ *
+ * Subclasses must implement onConnected, onDisconnected and onError;
+ * onMessage is an optional override.
+ *
+ * @template V - Version type (number, string, object, etc.)
+ * @template TUpward - Type of upward message data (Client -> Server)
+ * @template TDownward - Type of downward message data (Server -> Client)
+ *
+ * @example
+ * ```typescript
+ * class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> {
+ *   protected onConnected() {
+ *     console.log('Connected!');
+ *   }
+ *
+ *   protected onMessage(msg) {
+ *     console.log('Received:', msg.type, msg.data);
+ *   }
+ *
+ *   protected onDisconnected(code, reason) {
+ *     console.log('Disconnected:', code, reason);
+ *   }
+ *
+ *   protected onError(error) {
+ *     console.error('Error:', error);
+ *   }
+ * }
+ *
+ * const client = new MyClient({ url: 'ws://localhost:8080' });
+ * client.connect();
+ * client.sendMessage(1, 'configure', { language: 'en' });
+ * // sendAudio takes the frame version, encoding ID and sample rate as well as the data
+ * client.sendAudio(audioData, 1, AudioEncoding.LINEAR16, 16000);
+ * ```
+ */
+declare abstract class WebSocketAudioClient<V = number, // Version type (default: number)
+TUpward = unknown, // Upward message data type
+TDownward = unknown> {
+    private cfg;
+    protected versionSerializer: VersionSerializer<V>;
+    private ws;
+    private seq;
+    private HWM;
+    private LWM;
+    /**
+     * @param cfg - Connection settings (URL and optional water marks)
+     * @param versionSerializer - Optional converter between V and its numeric form
+     */
+    constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>);
+    /**
+     * Hook: Called when WebSocket connection is established
+     */
+    protected abstract onConnected(): void;
+    /**
+     * Hook: Called when WebSocket connection closes
+     * @param code - Close code (see WebSocketCloseCode enum)
+     * @param reason - Human-readable close reason
+     */
+    protected abstract onDisconnected(code: number, reason: string): void;
+    /**
+     * Hook: Called when WebSocket error occurs
+     * @param error - The error event
+     */
+    protected abstract onError(error: Event): void;
+    /**
+     * Hook: Called when downward message arrives from server
+     * Override this to handle messages (optional - default does nothing)
+     */
+    protected onMessage(_msg: Message<V> & {
+        data: TDownward;
+    }): void;
+    /**
+     * Start the WebSocket connection (implementation not visible in this declaration file)
+     */
+    connect(): void;
+    /**
+     * Send JSON message to server
+     * @param version - Message version
+     * @param type - Message type (developer defined)
+     * @param data - Message payload (typed)
+     */
+    sendMessage(version: V, type: string, data: TUpward): void;
+    /**
+     * Send audio frame with specified encoding and sample rate
+     * All four arguments are required.
+     * @param audioData - Audio data (any format: Int16Array, Uint8Array, ArrayBuffer, etc.)
+     * @param version - Audio frame version
+     * @param encodingId - Audio encoding ID (0-5, e.g., AudioEncoding.LINEAR16)
+     * @param sampleRate - Sample rate in Hz (e.g., 16000)
+     */
+    sendAudio(audioData: ArrayBuffer | ArrayBufferView, version: V, encodingId: number, sampleRate: number): void;
+    /**
+     * Get current WebSocket buffer size
+     */
+    getBufferedAmount(): number;
+    /**
+     * Check if local buffer is backpressured
+     */
+    isLocalBackpressured(): boolean;
+    /**
+     * Check if ready to send audio
+     * Verifies: connection open, no local buffer pressure
+     */
+    canSend(): boolean;
+    /**
+     * Check if connection is open
+     */
+    isOpen(): boolean;
+    /**
+     * Get current connection state
+     */
+    getReadyState(): number;
+    /**
+     * Close the WebSocket connection
+     * Protected method for subclasses to implement disconnect logic
+     * @param code - WebSocket close code (default: 1000 = normal closure)
+     * @param reason - Human-readable close reason
+     */
+    protected closeConnection(code?: number, reason?: string): void;
+}
+/**
+ * Recognition Client Types
+ *
+ * Type definitions and interfaces for the recognition client SDK.
+ * These interfaces enable dependency injection, testing, and alternative implementations.
+ */
+/**
+ * Client connection state enum
+ * Represents the various states a recognition client can be in during its lifecycle.
+ * Values are lowercase strings matching the member names.
+ */
+declare enum ClientState {
+    /** Initial state, no connection established */
+    INITIAL = "initial",
+    /** Actively establishing WebSocket connection */
+    CONNECTING = "connecting",
+    /** WebSocket connected but waiting for server ready signal */
+    CONNECTED = "connected",
+    /** Server ready, can send audio */
+    READY = "ready",
+    /** Sent stop signal, waiting for final transcript */
+    STOPPING = "stopping",
+    /** Connection closed normally after stop */
+    STOPPED = "stopped",
+    /** Connection failed or lost unexpectedly */
+    FAILED = "failed"
+}
+/**
+ * Callback URL configuration with message type filtering
+ */
+interface RecognitionCallbackUrl {
+    /** The callback URL endpoint */
+    url: string;
+    /**
+     * Array of message types to send to this URL. If empty/undefined, all types are sent.
+     * Entries may be strings or numbers.
+     */
+    messageTypes?: Array<string | number>;
+}
+interface IRecognitionClientConfig {
+    /**
+     * WebSocket endpoint URL (optional)
+     * Either `url` or `stage` must be provided.
+     * If both are provided, `url` takes precedence.
+     *
+     * Example with explicit URL:
+     * ```typescript
+     * { url: 'wss://custom-endpoint.example.com/ws/v1/recognize' }
+     * ```
+     */
+    url?: string;
+    /**
+     * Stage for recognition service (recommended)
+     * Either `url` or `stage` must be provided.
+     * If both are provided, `url` takes precedence.
+     * Defaults to production if neither is provided.
+     *
+     * Example with STAGES enum (recommended):
+     * ```typescript
+     * import { STAGES } from '@recog/shared-types';
+     * { stage: STAGES.STAGING }
+     * ```
+     *
+     * String values also accepted:
+     * ```typescript
+     * { stage: 'staging' }  // STAGES.LOCAL | STAGES.DEV | STAGES.STAGING | STAGES.PRODUCTION
+     * ```
+     */
+    stage?: Stage | string;
+    /** ASR configuration (provider, model, language, etc.) - optional */
+    asrRequestConfig?: ASRRequestConfig;
+    /** Game context for improved recognition accuracy */
+    gameContext?: GameContextV1;
+    /** Audio utterance ID (optional) - if not provided, a UUID v4 will be generated */
+    audioUtteranceId?: string;
+    /** Callback URLs for server-side notifications with optional message type filtering (optional)
+     *  The game side only needs to use this if another service needs to be notified about the transcription results.
+     */
+    callbackUrls?: RecognitionCallbackUrl[];
+    /** User identification (optional) */
+    userId?: string;
+    /** Game session identification (optional). called 'sessionId' in Platform and most games. */
+    gameSessionId?: string;
+    /** Device identification (optional) */
+    deviceId?: string;
+    /** Account identification (optional) */
+    accountId?: string;
+    /** Question answer identifier for tracking Q&A sessions (optional and tracking purpose only) */
+    questionAnswerId?: string;
+    /** Platform for audio recording device (optional, e.g., 'ios', 'android', 'web', 'unity') */
+    platform?: string;
+    /** Callback when transcript is received */
+    onTranscript?: (result: TranscriptionResultV1) => void;
+    /**
+     * Callback when function call is received
+     * Note: Not supported in 2025. P2 feature for future speech-to-function-call capability.
+     */
+    onFunctionCall?: (result: FunctionCallResultV1) => void;
+    /** Callback when metadata is received. Invoked only once, after transcription is complete. */
+    onMetadata?: (metadata: MetadataResultV1) => void;
+    /** Callback when error occurs */
+    onError?: (error: ErrorResultV1) => void;
+    /** Callback when connected to WebSocket */
+    onConnected?: () => void;
+    /**
+     * Callback when WebSocket disconnects
+     * @param code - WebSocket close code (1000 = normal, 1006 = abnormal, etc.)
+     * @param reason - Close reason string
+     */
+    onDisconnected?: (code: number, reason: string) => void;
+    /** High water mark for backpressure control (bytes) */
+    highWaterMark?: number;
+    /** Low water mark for backpressure control (bytes) */
+    lowWaterMark?: number;
+    /** Maximum buffer duration in seconds (default: 60s) */
+    maxBufferDurationSec?: number;
+    /** Expected chunks per second for ring buffer sizing (default: 100) */
+    chunksPerSecond?: number;
+    /**
+     * Connection retry configuration (optional)
+     * Only applies to initial connection establishment, not mid-stream interruptions.
+     *
+     * Default: { maxAttempts: 4, delayMs: 200 } (try once, retry 3 times = 4 total attempts)
+     *
+     * Timing: Attempt 1 → FAIL → wait 200ms → Attempt 2 → FAIL → wait 200ms → Attempt 3 → FAIL → wait 200ms → Attempt 4
+     *
+     * Example:
+     * ```typescript
+     * {
+     *   connectionRetry: {
+     *     maxAttempts: 2,  // Try connecting up to 2 times (1 retry)
+     *     delayMs: 500     // Wait 500ms between attempts
+     *   }
+     * }
+     * ```
+     */
+    connectionRetry?: {
+        /** Maximum number of connection attempts (default: 4, min: 1, max: 5) */
+        maxAttempts?: number;
+        /** Delay in milliseconds between retry attempts (default: 200ms) */
+        delayMs?: number;
+    };
+    /**
+     * Optional logger function for debugging
+     * If not provided, no logging will occur
+     * @param level - Log level: 'debug', 'info', 'warn', 'error'
+     * @param message - Log message
+     * @param data - Optional additional data
+     */
+    logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void;
+}
+/**
+ * Recognition Client Interface
+ *
+ * Main interface for real-time speech recognition clients.
+ * Provides methods for connection management (connect, stopRecording, stopAbnormally),
+ * audio streaming (sendAudio), and read-only inspection of the session
+ * (identifiers, state flags, statistics).
+ */
+interface IRecognitionClient {
+    /**
+     * Connect to the WebSocket endpoint
+     * @returns Promise that resolves when connected
+     * @throws Error if connection fails or times out
+     */
+    connect(): Promise<void>;
+    /**
+     * Send audio data to the recognition service
+     * Audio is buffered locally and sent when connection is ready.
+     * This method returns nothing, so the caller gets no per-chunk delivery result from it.
+     * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
+     */
+    sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Stop recording and wait for final transcript
+     * The server will close the connection after sending the final transcript.
+     * This is the graceful shutdown path; stopAbnormally() is the immediate one.
+     * @returns Promise that resolves when final transcript is received
+     */
+    stopRecording(): Promise<void>;
+    /**
+     * Force stop and immediately close connection without waiting for server
+     *
+     * WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
+     * - Does NOT wait for server to process remaining audio
+     * - Does NOT receive final transcript from server
+     * - Immediately closes WebSocket connection
+     * - Cleans up resources (buffers, listeners)
+     *
+     * Use Cases:
+     * - User explicitly cancels/abandons session
+     * - Timeout scenarios where waiting is not acceptable
+     * - Need immediate cleanup and can't wait for server
+     *
+     * RECOMMENDED: Use stopRecording() for normal shutdown.
+     * Only use this when immediate disconnection is required.
+     */
+    stopAbnormally(): void;
+    /**
+     * Get the audio utterance ID for this session
+     * Available immediately after client construction.
+     * NOTE(review): the config docs say a UUID v4 is generated only when `audioUtteranceId`
+     * is not provided, so a caller-supplied ID is presumably returned unchanged and need not
+     * be a UUID v4 - confirm against the implementation.
+     * @returns UUID v4 string identifying this recognition session
+     */
+    getAudioUtteranceId(): string;
+    /**
+     * Get the current state of the client
+     * @returns Current ClientState value
+     */
+    getState(): ClientState;
+    /**
+     * Check if WebSocket connection is open
+     * @returns true if connected and ready to communicate
+     */
+    isConnected(): boolean;
+    /**
+     * Check if client is currently connecting
+     * @returns true if connection is in progress
+     */
+    isConnecting(): boolean;
+    /**
+     * Check if client is currently stopping
+     * @returns true if stopRecording() is in progress
+     */
+    isStopping(): boolean;
+    /**
+     * Check if transcription has finished
+     * @returns true if the transcription is complete
+     */
+    isTranscriptionFinished(): boolean;
+    /**
+     * Check if the audio buffer has overflowed
+     * @returns true if the ring buffer has wrapped around
+     */
+    isBufferOverflowing(): boolean;
+    /**
+     * Get client statistics
+     * @returns Statistics about audio transmission and buffering (see IRecognitionClientStats)
+     */
+    getStats(): IRecognitionClientStats;
+    /**
+     * Get the WebSocket URL being used by this client
+     * Available immediately after client construction.
+     * @returns WebSocket URL string
+     */
+    getUrl(): string;
+}
+/**
+ * Client statistics interface
+ *
+ * Statistics about audio transmission and buffering; this is the return type of
+ * IRecognitionClient.getStats().
+ */
+interface IRecognitionClientStats {
+    /** Total audio bytes sent to server */
+    audioBytesSent: number;
+    /** Total number of audio chunks sent */
+    audioChunksSent: number;
+    /** Total number of audio chunks buffered */
+    audioChunksBuffered: number;
+    /** Number of times the ring buffer overflowed */
+    bufferOverflowCount: number;
+    /** Current number of chunks in buffer */
+    currentBufferedChunks: number;
+    /** Whether the ring buffer has wrapped (overwritten old data) */
+    hasWrapped: boolean;
+}
+/**
+ * Configuration for RealTimeTwoWayWebSocketRecognitionClient
+ * This extends IRecognitionClientConfig and is the main configuration interface
+ * for creating a new RealTimeTwoWayWebSocketRecognitionClient instance.
+ *
+ * The body is currently empty: it declares no members of its own, so every
+ * option comes from IRecognitionClientConfig.
+ */
+interface RealTimeTwoWayWebSocketRecognitionClientConfig extends IRecognitionClientConfig {
+}
+/**
+ * RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition
+ *
+ * Features:
+ * - Ring buffer-based audio storage with fixed memory footprint
+ * - Automatic buffering when disconnected, immediate send when connected
+ * - Buffer persists after flush (for future retry/reconnection scenarios)
+ * - Built on WebSocketAudioClient for robust protocol handling
+ * - Simple API: connect() → sendAudio() → stopRecording()
+ * - Type-safe message handling with callbacks
+ * - Automatic backpressure management
+ * - Overflow detection with buffer state tracking
+ *
+ * Example:
+ * ```typescript
+ * const client = new RealTimeTwoWayWebSocketRecognitionClient({
+ *   url: 'ws://localhost:3101/ws/v1/recognize',
+ *   onTranscript: (result) => console.log(result.finalTranscript),
+ *   onError: (error) => console.error(error),
+ *   maxBufferDurationSec: 60  // Ring buffer for 60 seconds
+ * });
+ *
+ * await client.connect();
+ *
+ * // Send audio chunks - always stored in ring buffer, sent if connected
+ * micStream.on('data', (chunk) => client.sendAudio(chunk));
+ *
+ * // Signal end of audio and wait for final results
+ * await client.stopRecording();
+ *
+ * // Server will close connection after sending finals
+ * // No manual cleanup needed - browser handles it
+ * ```
+ */
+/**
+ * Alias that exposes TranscriptionResultV1 under the unversioned name
+ * TranscriptionResult, kept for backward compatibility.
+ */
+type TranscriptionResult = TranscriptionResultV1;
+/**
+ * RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition
+ *
+ * Implements IRecognitionClient interface for dependency injection and testing.
+ * Extends WebSocketAudioClient with local audio buffering and simple callback-based API.
+ *
+ * The public methods declared below without their own comment (connect, sendAudio,
+ * stopRecording, stopAbnormally, getAudioUtteranceId, getUrl, getState, isConnected,
+ * isConnecting, isStopping, isTranscriptionFinished, isBufferOverflowing, getStats)
+ * have the same signatures as the IRecognitionClient members of the same name; see
+ * that interface for their contracts.
+ *
+ * Private members appear without types because this is a declaration file.
+ *
+ * NOTE(review): the protected onConnected / onDisconnected / onError / onMessage
+ * methods are presumably overrides of WebSocketAudioClient hooks - the base class is
+ * not declared in this part of the file, so confirm there.
+ */
+declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
+    private static readonly PROTOCOL_VERSION;
+    private config;
+    private audioBuffer;
+    private messageHandler;
+    private state;
+    private connectionPromise;
+    private isDebugLogEnabled;
+    private audioBytesSent;
+    private audioChunksSent;
+    private audioStatsLogInterval;
+    private lastAudioStatsLog;
+    constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig);
+    /**
+     * Internal logging helper - only logs if a logger was provided in config
+     * Debug logs are additionally gated by isDebugLogEnabled flag
+     * @param level - Log level: debug, info, warn, or error
+     * @param message - Message to log
+     * @param data - Optional additional data to log
+     */
+    private log;
+    /**
+     * Clean up internal resources to free memory
+     * Called when connection closes (normally or abnormally)
+     */
+    private cleanup;
+    connect(): Promise<void>;
+    /**
+     * Attempt to connect with retry logic
+     * Only retries on initial connection establishment, not mid-stream interruptions
+     * (the retry settings are documented on `connectionRetry` in the client config)
+     */
+    private connectWithRetry;
+    sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    private sendAudioInternal;
+    stopRecording(): Promise<void>;
+    stopAbnormally(): void;
+    getAudioUtteranceId(): string;
+    getUrl(): string;
+    getState(): ClientState;
+    isConnected(): boolean;
+    isConnecting(): boolean;
+    isStopping(): boolean;
+    isTranscriptionFinished(): boolean;
+    isBufferOverflowing(): boolean;
+    getStats(): IRecognitionClientStats;
+    protected onConnected(): void;
+    protected onDisconnected(code: number, reason: string): void;
+    /**
+     * Get human-readable description for WebSocket close code
+     */
+    private getCloseCodeDescription;
+    protected onError(error: Event): void;
+    protected onMessage(msg: {
+        v: number;
+        type: string;
+        data: any;
+    }): void;
+    /**
+     * Handle control messages from server
+     * @param msg - Control message containing server actions
+     */
+    private handleControlMessage;
+    /**
+     * Send audio immediately to the server (without buffering)
+     * @param audioData - Audio data to send
+     */
+    private sendAudioNow;
+}
+export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };
+export type { GameContextV1, RealTimeTwoWayWebSocketRecognitionClientConfig, TranscriptionResult };