@volley/recognition-client-sdk 0.1.424 → 0.1.621

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,11 @@ declare enum RecognitionProvider {
15
15
  GOOGLE = "google",
16
16
  GEMINI_BATCH = "gemini-batch",
17
17
  OPENAI_BATCH = "openai-batch",
18
- OPENAI_REALTIME = "openai-realtime"
18
+ OPENAI_REALTIME = "openai-realtime",
19
+ MISTRAL_VOXTRAL = "mistral-voxtral",
20
+ DASHSCOPE = "dashscope",
21
+ TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
22
+ TEST_ASR_STREAMING = "test-asr-streaming"
19
23
  }
20
24
  /**
21
25
  * ASR API type - distinguishes between streaming and file-based transcription APIs
@@ -77,14 +81,31 @@ declare enum ElevenLabsModel {
77
81
  * OpenAI Realtime API transcription models
78
82
  * These are the verified `input_audio_transcription.model` values.
79
83
  * @see https://platform.openai.com/docs/guides/realtime
84
+ * @see https://platform.openai.com/docs/models/gpt-4o-transcribe
80
85
  */
81
86
  declare enum OpenAIRealtimeModel {
87
+ GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
82
88
  GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
83
89
  }
90
+ /**
91
+ * Mistral Voxtral Realtime transcription models
92
+ * @see https://docs.mistral.ai/models/voxtral-mini-transcribe-realtime-26-02
93
+ */
94
+ declare enum MistralVoxtralModel {
95
+ VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
96
+ }
97
+ /**
98
+ * DashScope Qwen-ASR Realtime transcription models
99
+ * @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
100
+ */
101
+ declare enum DashScopeModel {
102
+ QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
103
+ QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
104
+ }
84
105
  /**
85
106
  * Type alias for any model from any provider
86
107
  */
87
- type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
108
+ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
88
109
 
89
110
  /**
90
111
  * Audio encoding types
@@ -230,8 +251,10 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
230
251
  type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
231
252
  audioUtteranceId: z.ZodString;
232
253
  finalTranscript: z.ZodString;
254
+ finalTranscriptRaw: z.ZodString;
233
255
  finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
234
256
  pendingTranscript: z.ZodOptional<z.ZodString>;
257
+ pendingTranscriptRaw: z.ZodOptional<z.ZodString>;
235
258
  pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
236
259
  is_finished: z.ZodBoolean;
237
260
  voiceStart: z.ZodOptional<z.ZodNumber>;
@@ -241,13 +264,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
241
264
  endTimestamp: z.ZodOptional<z.ZodNumber>;
242
265
  receivedAtMs: z.ZodOptional<z.ZodNumber>;
243
266
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
267
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
244
268
  }, "strip", z.ZodTypeAny, {
245
269
  type: RecognitionResultTypeV1.TRANSCRIPTION;
246
270
  audioUtteranceId: string;
247
271
  finalTranscript: string;
272
+ finalTranscriptRaw: string;
248
273
  is_finished: boolean;
249
274
  finalTranscriptConfidence?: number | undefined;
250
275
  pendingTranscript?: string | undefined;
276
+ pendingTranscriptRaw?: string | undefined;
251
277
  pendingTranscriptConfidence?: number | undefined;
252
278
  voiceStart?: number | undefined;
253
279
  voiceDuration?: number | undefined;
@@ -256,13 +282,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
256
282
  endTimestamp?: number | undefined;
257
283
  receivedAtMs?: number | undefined;
258
284
  accumulatedAudioTimeMs?: number | undefined;
285
+ rawAudioTimeMs?: number | undefined;
259
286
  }, {
260
287
  type: RecognitionResultTypeV1.TRANSCRIPTION;
261
288
  audioUtteranceId: string;
262
289
  finalTranscript: string;
290
+ finalTranscriptRaw: string;
263
291
  is_finished: boolean;
264
292
  finalTranscriptConfidence?: number | undefined;
265
293
  pendingTranscript?: string | undefined;
294
+ pendingTranscriptRaw?: string | undefined;
266
295
  pendingTranscriptConfidence?: number | undefined;
267
296
  voiceStart?: number | undefined;
268
297
  voiceDuration?: number | undefined;
@@ -271,6 +300,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
271
300
  endTimestamp?: number | undefined;
272
301
  receivedAtMs?: number | undefined;
273
302
  accumulatedAudioTimeMs?: number | undefined;
303
+ rawAudioTimeMs?: number | undefined;
274
304
  }>;
275
305
  type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
276
306
  /**
@@ -300,11 +330,22 @@ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
300
330
  * - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
301
331
  * - EMPTY → recog.client.websocket.transcript.final_empty
302
332
  * - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
333
+ * - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
303
334
  */
304
335
  declare enum TranscriptOutcomeType {
305
336
  WITH_CONTENT = "with_content",
306
337
  EMPTY = "empty",
307
- NEVER_SENT = "never_sent"
338
+ NEVER_SENT = "never_sent",
339
+ ERROR_AUTHENTICATION = "error_authentication",
340
+ ERROR_VALIDATION = "error_validation",
341
+ ERROR_PROVIDER = "error_provider",
342
+ ERROR_TIMEOUT = "error_timeout",
343
+ ERROR_QUOTA = "error_quota",
344
+ ERROR_INTERNAL_QUOTA = "error_internal_quota",
345
+ ERROR_CONNECTION = "error_connection",
346
+ ERROR_NO_AUDIO = "error_no_audio",
347
+ ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
348
+ ERROR_UNKNOWN = "error_unknown"
308
349
  }
309
350
  /**
310
351
  * Metadata result V1 - contains metadata, timing information, and ASR config
@@ -314,6 +355,7 @@ declare enum TranscriptOutcomeType {
314
355
  declare const MetadataResultSchemaV1: z.ZodObject<{
315
356
  type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
316
357
  audioUtteranceId: z.ZodString;
358
+ connectionInitiatedAtMs: z.ZodOptional<z.ZodNumber>;
317
359
  recordingStartMs: z.ZodOptional<z.ZodNumber>;
318
360
  recordingEndMs: z.ZodOptional<z.ZodNumber>;
319
361
  transcriptEndMs: z.ZodOptional<z.ZodNumber>;
@@ -321,14 +363,53 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
321
363
  duration: z.ZodOptional<z.ZodNumber>;
322
364
  volume: z.ZodOptional<z.ZodNumber>;
323
365
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
366
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
324
367
  costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
325
368
  apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
326
369
  asrConfig: z.ZodOptional<z.ZodString>;
327
370
  rawAsrMetadata: z.ZodOptional<z.ZodString>;
328
371
  transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
372
+ audioMetrics: z.ZodOptional<z.ZodObject<{
373
+ valid: z.ZodBoolean;
374
+ audioBeginMs: z.ZodNumber;
375
+ audioEndMs: z.ZodNumber;
376
+ maxVolume: z.ZodNumber;
377
+ minVolume: z.ZodNumber;
378
+ avgVolume: z.ZodNumber;
379
+ silenceRatio: z.ZodNumber;
380
+ clippingRatio: z.ZodNumber;
381
+ snrEstimate: z.ZodNullable<z.ZodNumber>;
382
+ lastNonSilenceMs: z.ZodNumber;
383
+ timestamp: z.ZodString;
384
+ }, "strip", z.ZodTypeAny, {
385
+ valid: boolean;
386
+ audioBeginMs: number;
387
+ audioEndMs: number;
388
+ maxVolume: number;
389
+ minVolume: number;
390
+ avgVolume: number;
391
+ silenceRatio: number;
392
+ clippingRatio: number;
393
+ snrEstimate: number | null;
394
+ lastNonSilenceMs: number;
395
+ timestamp: string;
396
+ }, {
397
+ valid: boolean;
398
+ audioBeginMs: number;
399
+ audioEndMs: number;
400
+ maxVolume: number;
401
+ minVolume: number;
402
+ avgVolume: number;
403
+ silenceRatio: number;
404
+ clippingRatio: number;
405
+ snrEstimate: number | null;
406
+ lastNonSilenceMs: number;
407
+ timestamp: string;
408
+ }>>;
329
409
  }, "strip", z.ZodTypeAny, {
330
410
  type: RecognitionResultTypeV1.METADATA;
331
411
  audioUtteranceId: string;
412
+ connectionInitiatedAtMs?: number | undefined;
332
413
  recordingStartMs?: number | undefined;
333
414
  recordingEndMs?: number | undefined;
334
415
  transcriptEndMs?: number | undefined;
@@ -336,14 +417,29 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
336
417
  duration?: number | undefined;
337
418
  volume?: number | undefined;
338
419
  accumulatedAudioTimeMs?: number | undefined;
420
+ rawAudioTimeMs?: number | undefined;
339
421
  costInUSD?: number | undefined;
340
422
  apiType?: ASRApiType | undefined;
341
423
  asrConfig?: string | undefined;
342
424
  rawAsrMetadata?: string | undefined;
343
425
  transcriptOutcome?: TranscriptOutcomeType | undefined;
426
+ audioMetrics?: {
427
+ valid: boolean;
428
+ audioBeginMs: number;
429
+ audioEndMs: number;
430
+ maxVolume: number;
431
+ minVolume: number;
432
+ avgVolume: number;
433
+ silenceRatio: number;
434
+ clippingRatio: number;
435
+ snrEstimate: number | null;
436
+ lastNonSilenceMs: number;
437
+ timestamp: string;
438
+ } | undefined;
344
439
  }, {
345
440
  type: RecognitionResultTypeV1.METADATA;
346
441
  audioUtteranceId: string;
442
+ connectionInitiatedAtMs?: number | undefined;
347
443
  recordingStartMs?: number | undefined;
348
444
  recordingEndMs?: number | undefined;
349
445
  transcriptEndMs?: number | undefined;
@@ -351,11 +447,25 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
351
447
  duration?: number | undefined;
352
448
  volume?: number | undefined;
353
449
  accumulatedAudioTimeMs?: number | undefined;
450
+ rawAudioTimeMs?: number | undefined;
354
451
  costInUSD?: number | undefined;
355
452
  apiType?: ASRApiType | undefined;
356
453
  asrConfig?: string | undefined;
357
454
  rawAsrMetadata?: string | undefined;
358
455
  transcriptOutcome?: TranscriptOutcomeType | undefined;
456
+ audioMetrics?: {
457
+ valid: boolean;
458
+ audioBeginMs: number;
459
+ audioEndMs: number;
460
+ maxVolume: number;
461
+ minVolume: number;
462
+ avgVolume: number;
463
+ silenceRatio: number;
464
+ clippingRatio: number;
465
+ snrEstimate: number | null;
466
+ lastNonSilenceMs: number;
467
+ timestamp: string;
468
+ } | undefined;
359
469
  }>;
360
470
  type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
361
471
  /**
@@ -367,7 +477,10 @@ declare enum ErrorTypeV1 {
367
477
  PROVIDER_ERROR = "provider_error",
368
478
  TIMEOUT_ERROR = "timeout_error",
369
479
  QUOTA_EXCEEDED = "quota_exceeded",
480
+ INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
370
481
  CONNECTION_ERROR = "connection_error",
482
+ NO_AUDIO_ERROR = "no_audio_error",
483
+ CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
371
484
  UNKNOWN_ERROR = "unknown_error"
372
485
  }
373
486
  /**
@@ -419,6 +532,15 @@ declare enum ControlSignalTypeV1 {
419
532
  START_RECORDING = "start_recording",
420
533
  STOP_RECORDING = "stop_recording"
421
534
  }
535
+ /**
536
+ * Prefix audio mode for ASR Request V1
537
+ * Controls how prefix audio is handled during recognition
538
+ */
539
+ declare enum PrefixMode {
540
+ NONE = "none",
541
+ CLIENT = "client",
542
+ STORED = "stored"
543
+ }
422
544
  /**
423
545
  * Game context V1 - contains game state information
424
546
  */
@@ -476,13 +598,13 @@ declare enum FinalTranscriptStability {
476
598
  */
477
599
  AGGRESSIVE = "aggressive",
478
600
  /**
479
- * Balanced mode: 200ms timeout (default)
601
+ * Balanced mode: 500ms timeout (default)
480
602
  * Natural middle ground for most conversational scenarios
481
603
  * Use cases: General customer support, tech support, typical voice interactions
482
604
  */
483
605
  BALANCED = "balanced",
484
606
  /**
485
- * Conservative mode: 400ms timeout
607
+ * Conservative mode: 1000ms timeout
486
608
  * Wait longer for providers, optimized for complex/reflective speech
487
609
  * Use cases: Healthcare, complex queries, careful thought processes
488
610
  */
@@ -574,13 +696,70 @@ interface ASRRequestConfig {
574
696
  * doesn't respond with is_final=true after stopRecording().
575
697
  *
576
698
  * - aggressive: 100ms - fast response, may cut off slow providers
577
- * - balanced: 200ms - current default, good for most cases
578
- * - conservative: 400ms - wait longer for complex utterances
699
+ * - balanced: 500ms - current default, good for most cases
700
+ * - conservative: 1000ms - wait longer for complex utterances
579
701
  *
580
702
  * @default 'balanced'
581
703
  * @see FinalTranscriptStability enum for detailed descriptions
582
704
  */
583
705
  finalTranscriptStability?: FinalTranscriptStability | string;
706
+ /**
707
+ * Traffic control priority for quota slot allocation
708
+ *
709
+ * Controls which quota slots this request can use when traffic control is enabled.
710
+ * The quota system reserves a portion of slots for high-priority requests.
711
+ *
712
+ * - 'high': Can use all quota slots (reserved for critical games like song-quiz)
713
+ * - 'low': Limited to non-reserved slots (default for most requests)
714
+ *
715
+ * @default 'low'
716
+ */
717
+ priority?: 'low' | 'high';
718
+ /**
719
+ * Prefix audio injection mode
720
+ *
721
+ * Controls how prefix audio is handled:
722
+ * - 'none': No prefix audio (default)
723
+ * - 'client': Client sends PREFIX_AUDIO before user audio
724
+ * - 'stored': Server injects stored prefix audio by prefixId
725
+ *
726
+ * @default 'none'
727
+ */
728
+ prefixMode?: PrefixMode | string;
729
+ /**
730
+ * Stored prefix audio identifier
731
+ *
732
+ * Only used when prefixMode='stored'. The server will look up this ID
733
+ * in the PrefixAudioCache and inject the corresponding audio before
734
+ * user audio is processed.
735
+ *
736
+ * @example 'song_quiz'
737
+ */
738
+ prefixId?: string;
739
+ /**
740
+ * Prefix text patterns to remove from transcripts
741
+ *
742
+ * Array of prefix text variants that should be stripped from the transcript.
743
+ * This is used when prefix audio is injected and the ASR transcribes both
744
+ * the prefix and user speech - we remove the prefix portion.
745
+ *
746
+ * Multiple variants are supported because ASR may transcribe contractions
747
+ * differently (e.g., "What's this song" vs "What is this song").
748
+ *
749
+ * Matching rules:
750
+ * - Case insensitive
751
+ * - Leading/trailing whitespace trimmed
752
+ * - Multiple spaces collapsed
753
+ * - Punctuation (?.!,) stripped for matching
754
+ * - Apostrophes preserved (part of contractions)
755
+ *
756
+ * Can be set via:
757
+ * - Server-side game config (production)
758
+ * - Client-side ASRRequest (testing/override) - takes precedence
759
+ *
760
+ * @example ["What's this song", "What is this song"]
761
+ */
762
+ prefixTextToRemove?: string[];
584
763
  /**
585
764
  * Additional provider-specific options
586
765
  *
@@ -1040,6 +1219,26 @@ interface IRecognitionClient {
1040
1219
  * @returns WebSocket URL string
1041
1220
  */
1042
1221
  getUrl(): string;
1222
+ /**
1223
+ * Send game context after connection is established (for preconnect flow).
1224
+ *
1225
+ * Preconnect flow: Create client with asrRequestConfig (useContext: true) but
1226
+ * WITHOUT gameContext → call connect() → WS opens, ASRRequest sent, server
1227
+ * waits in PENDING_CONTEXT → later call sendGameContext() with slotMap →
1228
+ * server attaches provider and sends READY.
1229
+ *
1230
+ * This enables connecting early (before slotMap is known) and sending
1231
+ * game context later when question data is available.
1232
+ *
1233
+ * @param context - Game context including slotMap for keyword boosting
1234
+ */
1235
+ sendGameContext(context: GameContextV1): void;
1236
+ /**
1237
+ * Check if server has sent READY signal (provider is connected and ready for audio).
1238
+ * In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
1239
+ * @returns true if server is ready to receive audio
1240
+ */
1241
+ isServerReady(): boolean;
1043
1242
  }
1044
1243
  /**
1045
1244
  * Client statistics interface
@@ -1114,8 +1313,11 @@ type TranscriptionResult = TranscriptionResultV1;
1114
1313
  */
1115
1314
  declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
1116
1315
  private static readonly PROTOCOL_VERSION;
1316
+ private static readonly MAX_PREFIX_BUFFER_BYTES;
1117
1317
  private config;
1118
1318
  private audioBuffer;
1319
+ private prefixBuffer;
1320
+ private prefixBufferBytes;
1119
1321
  private messageHandler;
1120
1322
  private state;
1121
1323
  private connectionPromise;
@@ -1160,6 +1362,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1160
1362
  isStopping(): boolean;
1161
1363
  isTranscriptionFinished(): boolean;
1162
1364
  isBufferOverflowing(): boolean;
1365
+ isServerReady(): boolean;
1366
+ sendGameContext(context: GameContextV1): void;
1163
1367
  getStats(): IRecognitionClientStats;
1164
1368
  protected onConnected(): void;
1165
1369
  protected onDisconnected(code: number, reason: string): void;
@@ -1183,6 +1387,28 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1183
1387
  * @param audioData - Audio data to send
1184
1388
  */
1185
1389
  private sendAudioNow;
1390
+ /**
1391
+ * Send prefix audio to the server.
1392
+ * Prefix audio is sent before user audio and is used for context/priming.
1393
+ * The server will process it but adjust timing so transcripts reflect user audio timing.
1394
+ *
1395
+ * Note: Prefix audio is buffered until READY state, then flushed before user audio.
1396
+ * This ensures proper ordering even if called before server is ready.
1397
+ *
1398
+ * @param audioData - Prefix audio data (ArrayBuffer, ArrayBufferView, or Blob)
1399
+ */
1400
+ sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
1401
+ /**
1402
+ * Internal method to handle prefix audio with buffering
1403
+ * Buffers if not READY, sends immediately if READY
1404
+ */
1405
+ private sendPrefixAudioInternal;
1406
+ /**
1407
+ * Send prefix audio immediately to the server (without buffering)
1408
+ * Uses encoding offset to mark as prefix audio
1409
+ * @param audioData - Prefix audio data to send
1410
+ */
1411
+ private sendPrefixAudioNow;
1186
1412
  }
1187
1413
 
1188
1414
  export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };