@volley/recognition-client-sdk 0.1.424 → 0.1.622

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
@@ -15,7 +15,11 @@ declare enum RecognitionProvider {
15
15
  GOOGLE = "google",
16
16
  GEMINI_BATCH = "gemini-batch",
17
17
  OPENAI_BATCH = "openai-batch",
18
- OPENAI_REALTIME = "openai-realtime"
18
+ OPENAI_REALTIME = "openai-realtime",
19
+ MISTRAL_VOXTRAL = "mistral-voxtral",
20
+ DASHSCOPE = "dashscope",
21
+ TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
22
+ TEST_ASR_STREAMING = "test-asr-streaming"
19
23
  }
20
24
  /**
21
25
  * ASR API type - distinguishes between streaming and file-based transcription APIs
@@ -77,14 +81,31 @@ declare enum ElevenLabsModel {
77
81
  * OpenAI Realtime API transcription models
78
82
  * These are the verified `input_audio_transcription.model` values.
79
83
  * @see https://platform.openai.com/docs/guides/realtime
84
+ * @see https://platform.openai.com/docs/models/gpt-4o-transcribe
80
85
  */
81
86
  declare enum OpenAIRealtimeModel {
87
+ GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
82
88
  GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
83
89
  }
90
+ /**
91
+ * Mistral Voxtral Realtime transcription models
92
+ * @see https://docs.mistral.ai/models/voxtral-mini-transcribe-realtime-26-02
93
+ */
94
+ declare enum MistralVoxtralModel {
95
+ VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
96
+ }
97
+ /**
98
+ * DashScope Qwen-ASR Realtime transcription models
99
+ * @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
100
+ */
101
+ declare enum DashScopeModel {
102
+ QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
103
+ QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
104
+ }
84
105
  /**
85
106
  * Type alias for any model from any provider
86
107
  */
87
- type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
108
+ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
88
109
 
89
110
  /**
90
111
  * Audio encoding types
@@ -230,47 +251,59 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
230
251
  type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
231
252
  audioUtteranceId: z.ZodString;
232
253
  finalTranscript: z.ZodString;
254
+ finalTranscriptRaw: z.ZodString;
233
255
  finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
234
256
  pendingTranscript: z.ZodOptional<z.ZodString>;
257
+ pendingTranscriptRaw: z.ZodOptional<z.ZodString>;
235
258
  pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
236
259
  is_finished: z.ZodBoolean;
237
260
  voiceStart: z.ZodOptional<z.ZodNumber>;
238
261
  voiceDuration: z.ZodOptional<z.ZodNumber>;
239
262
  voiceEnd: z.ZodOptional<z.ZodNumber>;
263
+ lastNonSilence: z.ZodOptional<z.ZodNumber>;
240
264
  startTimestamp: z.ZodOptional<z.ZodNumber>;
241
265
  endTimestamp: z.ZodOptional<z.ZodNumber>;
242
266
  receivedAtMs: z.ZodOptional<z.ZodNumber>;
243
267
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
268
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
244
269
  }, "strip", z.ZodTypeAny, {
245
270
  type: RecognitionResultTypeV1.TRANSCRIPTION;
246
271
  audioUtteranceId: string;
247
272
  finalTranscript: string;
273
+ finalTranscriptRaw: string;
248
274
  is_finished: boolean;
249
275
  finalTranscriptConfidence?: number | undefined;
250
276
  pendingTranscript?: string | undefined;
277
+ pendingTranscriptRaw?: string | undefined;
251
278
  pendingTranscriptConfidence?: number | undefined;
252
279
  voiceStart?: number | undefined;
253
280
  voiceDuration?: number | undefined;
254
281
  voiceEnd?: number | undefined;
282
+ lastNonSilence?: number | undefined;
255
283
  startTimestamp?: number | undefined;
256
284
  endTimestamp?: number | undefined;
257
285
  receivedAtMs?: number | undefined;
258
286
  accumulatedAudioTimeMs?: number | undefined;
287
+ rawAudioTimeMs?: number | undefined;
259
288
  }, {
260
289
  type: RecognitionResultTypeV1.TRANSCRIPTION;
261
290
  audioUtteranceId: string;
262
291
  finalTranscript: string;
292
+ finalTranscriptRaw: string;
263
293
  is_finished: boolean;
264
294
  finalTranscriptConfidence?: number | undefined;
265
295
  pendingTranscript?: string | undefined;
296
+ pendingTranscriptRaw?: string | undefined;
266
297
  pendingTranscriptConfidence?: number | undefined;
267
298
  voiceStart?: number | undefined;
268
299
  voiceDuration?: number | undefined;
269
300
  voiceEnd?: number | undefined;
301
+ lastNonSilence?: number | undefined;
270
302
  startTimestamp?: number | undefined;
271
303
  endTimestamp?: number | undefined;
272
304
  receivedAtMs?: number | undefined;
273
305
  accumulatedAudioTimeMs?: number | undefined;
306
+ rawAudioTimeMs?: number | undefined;
274
307
  }>;
275
308
  type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
276
309
  /**
@@ -300,11 +333,22 @@ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
300
333
  * - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
301
334
  * - EMPTY → recog.client.websocket.transcript.final_empty
302
335
  * - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
336
+ * - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
303
337
  */
304
338
  declare enum TranscriptOutcomeType {
305
339
  WITH_CONTENT = "with_content",
306
340
  EMPTY = "empty",
307
- NEVER_SENT = "never_sent"
341
+ NEVER_SENT = "never_sent",
342
+ ERROR_AUTHENTICATION = "error_authentication",
343
+ ERROR_VALIDATION = "error_validation",
344
+ ERROR_PROVIDER = "error_provider",
345
+ ERROR_TIMEOUT = "error_timeout",
346
+ ERROR_QUOTA = "error_quota",
347
+ ERROR_INTERNAL_QUOTA = "error_internal_quota",
348
+ ERROR_CONNECTION = "error_connection",
349
+ ERROR_NO_AUDIO = "error_no_audio",
350
+ ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
351
+ ERROR_UNKNOWN = "error_unknown"
308
352
  }
309
353
  /**
310
354
  * Metadata result V1 - contains metadata, timing information, and ASR config
@@ -314,6 +358,7 @@ declare enum TranscriptOutcomeType {
314
358
  declare const MetadataResultSchemaV1: z.ZodObject<{
315
359
  type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
316
360
  audioUtteranceId: z.ZodString;
361
+ connectionInitiatedAtMs: z.ZodOptional<z.ZodNumber>;
317
362
  recordingStartMs: z.ZodOptional<z.ZodNumber>;
318
363
  recordingEndMs: z.ZodOptional<z.ZodNumber>;
319
364
  transcriptEndMs: z.ZodOptional<z.ZodNumber>;
@@ -321,14 +366,53 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
321
366
  duration: z.ZodOptional<z.ZodNumber>;
322
367
  volume: z.ZodOptional<z.ZodNumber>;
323
368
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
369
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
324
370
  costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
325
371
  apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
326
372
  asrConfig: z.ZodOptional<z.ZodString>;
327
373
  rawAsrMetadata: z.ZodOptional<z.ZodString>;
328
374
  transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
375
+ audioMetrics: z.ZodOptional<z.ZodObject<{
376
+ valid: z.ZodBoolean;
377
+ audioBeginMs: z.ZodNumber;
378
+ audioEndMs: z.ZodNumber;
379
+ maxVolume: z.ZodNumber;
380
+ minVolume: z.ZodNumber;
381
+ avgVolume: z.ZodNumber;
382
+ silenceRatio: z.ZodNumber;
383
+ clippingRatio: z.ZodNumber;
384
+ snrEstimate: z.ZodNullable<z.ZodNumber>;
385
+ lastNonSilenceMs: z.ZodNumber;
386
+ timestamp: z.ZodString;
387
+ }, "strip", z.ZodTypeAny, {
388
+ valid: boolean;
389
+ audioBeginMs: number;
390
+ audioEndMs: number;
391
+ maxVolume: number;
392
+ minVolume: number;
393
+ avgVolume: number;
394
+ silenceRatio: number;
395
+ clippingRatio: number;
396
+ snrEstimate: number | null;
397
+ lastNonSilenceMs: number;
398
+ timestamp: string;
399
+ }, {
400
+ valid: boolean;
401
+ audioBeginMs: number;
402
+ audioEndMs: number;
403
+ maxVolume: number;
404
+ minVolume: number;
405
+ avgVolume: number;
406
+ silenceRatio: number;
407
+ clippingRatio: number;
408
+ snrEstimate: number | null;
409
+ lastNonSilenceMs: number;
410
+ timestamp: string;
411
+ }>>;
329
412
  }, "strip", z.ZodTypeAny, {
330
413
  type: RecognitionResultTypeV1.METADATA;
331
414
  audioUtteranceId: string;
415
+ connectionInitiatedAtMs?: number | undefined;
332
416
  recordingStartMs?: number | undefined;
333
417
  recordingEndMs?: number | undefined;
334
418
  transcriptEndMs?: number | undefined;
@@ -336,14 +420,29 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
336
420
  duration?: number | undefined;
337
421
  volume?: number | undefined;
338
422
  accumulatedAudioTimeMs?: number | undefined;
423
+ rawAudioTimeMs?: number | undefined;
339
424
  costInUSD?: number | undefined;
340
425
  apiType?: ASRApiType | undefined;
341
426
  asrConfig?: string | undefined;
342
427
  rawAsrMetadata?: string | undefined;
343
428
  transcriptOutcome?: TranscriptOutcomeType | undefined;
429
+ audioMetrics?: {
430
+ valid: boolean;
431
+ audioBeginMs: number;
432
+ audioEndMs: number;
433
+ maxVolume: number;
434
+ minVolume: number;
435
+ avgVolume: number;
436
+ silenceRatio: number;
437
+ clippingRatio: number;
438
+ snrEstimate: number | null;
439
+ lastNonSilenceMs: number;
440
+ timestamp: string;
441
+ } | undefined;
344
442
  }, {
345
443
  type: RecognitionResultTypeV1.METADATA;
346
444
  audioUtteranceId: string;
445
+ connectionInitiatedAtMs?: number | undefined;
347
446
  recordingStartMs?: number | undefined;
348
447
  recordingEndMs?: number | undefined;
349
448
  transcriptEndMs?: number | undefined;
@@ -351,11 +450,25 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
351
450
  duration?: number | undefined;
352
451
  volume?: number | undefined;
353
452
  accumulatedAudioTimeMs?: number | undefined;
453
+ rawAudioTimeMs?: number | undefined;
354
454
  costInUSD?: number | undefined;
355
455
  apiType?: ASRApiType | undefined;
356
456
  asrConfig?: string | undefined;
357
457
  rawAsrMetadata?: string | undefined;
358
458
  transcriptOutcome?: TranscriptOutcomeType | undefined;
459
+ audioMetrics?: {
460
+ valid: boolean;
461
+ audioBeginMs: number;
462
+ audioEndMs: number;
463
+ maxVolume: number;
464
+ minVolume: number;
465
+ avgVolume: number;
466
+ silenceRatio: number;
467
+ clippingRatio: number;
468
+ snrEstimate: number | null;
469
+ lastNonSilenceMs: number;
470
+ timestamp: string;
471
+ } | undefined;
359
472
  }>;
360
473
  type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
361
474
  /**
@@ -367,7 +480,10 @@ declare enum ErrorTypeV1 {
367
480
  PROVIDER_ERROR = "provider_error",
368
481
  TIMEOUT_ERROR = "timeout_error",
369
482
  QUOTA_EXCEEDED = "quota_exceeded",
483
+ INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
370
484
  CONNECTION_ERROR = "connection_error",
485
+ NO_AUDIO_ERROR = "no_audio_error",
486
+ CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
371
487
  UNKNOWN_ERROR = "unknown_error"
372
488
  }
373
489
  /**
@@ -419,6 +535,15 @@ declare enum ControlSignalTypeV1 {
419
535
  START_RECORDING = "start_recording",
420
536
  STOP_RECORDING = "stop_recording"
421
537
  }
538
+ /**
539
+ * Prefix audio mode for ASR Request V1
540
+ * Controls how prefix audio is handled during recognition
541
+ */
542
+ declare enum PrefixMode {
543
+ NONE = "none",
544
+ CLIENT = "client",
545
+ STORED = "stored"
546
+ }
422
547
  /**
423
548
  * Game context V1 - contains game state information
424
549
  */
@@ -476,13 +601,13 @@ declare enum FinalTranscriptStability {
476
601
  */
477
602
  AGGRESSIVE = "aggressive",
478
603
  /**
479
- * Balanced mode: 200ms timeout (default)
604
+ * Balanced mode: 500ms timeout (default)
480
605
  * Natural middle ground for most conversational scenarios
481
606
  * Use cases: General customer support, tech support, typical voice interactions
482
607
  */
483
608
  BALANCED = "balanced",
484
609
  /**
485
- * Conservative mode: 400ms timeout
610
+ * Conservative mode: 1000ms timeout
486
611
  * Wait longer for providers, optimized for complex/reflective speech
487
612
  * Use cases: Healthcare, complex queries, careful thought processes
488
613
  */
@@ -574,13 +699,70 @@ interface ASRRequestConfig {
574
699
  * doesn't respond with is_final=true after stopRecording().
575
700
  *
576
701
  * - aggressive: 100ms - fast response, may cut off slow providers
577
- * - balanced: 200ms - current default, good for most cases
578
- * - conservative: 400ms - wait longer for complex utterances
702
+ * - balanced: 500ms - current default, good for most cases
703
+ * - conservative: 1000ms - wait longer for complex utterances
579
704
  *
580
705
  * @default 'balanced'
581
706
  * @see FinalTranscriptStability enum for detailed descriptions
582
707
  */
583
708
  finalTranscriptStability?: FinalTranscriptStability | string;
709
+ /**
710
+ * Traffic control priority for quota slot allocation
711
+ *
712
+ * Controls which quota slots this request can use when traffic control is enabled.
713
+ * The quota system reserves a portion of slots for high-priority requests.
714
+ *
715
+ * - 'high': Can use all quota slots, including the reserved ones (high priority is meant for critical games like song-quiz)
716
+ * - 'low': Limited to non-reserved slots (default for most requests)
717
+ *
718
+ * @default 'low'
719
+ */
720
+ priority?: 'low' | 'high';
721
+ /**
722
+ * Prefix audio injection mode
723
+ *
724
+ * Controls how prefix audio is handled:
725
+ * - 'none': No prefix audio (default)
726
+ * - 'client': Client sends PREFIX_AUDIO before user audio
727
+ * - 'stored': Server injects stored prefix audio by prefixId
728
+ *
729
+ * @default 'none'
730
+ */
731
+ prefixMode?: PrefixMode | string;
732
+ /**
733
+ * Stored prefix audio identifier
734
+ *
735
+ * Only used when prefixMode='stored'. The server will look up this ID
736
+ * in the PrefixAudioCache and inject the corresponding audio before
737
+ * user audio is processed.
738
+ *
739
+ * @example 'song_quiz'
740
+ */
741
+ prefixId?: string;
742
+ /**
743
+ * Prefix text patterns to remove from transcripts
744
+ *
745
+ * Array of prefix text variants that should be stripped from the transcript.
746
+ * This is used when prefix audio is injected and the ASR transcribes both
747
+ * the prefix and user speech - we remove the prefix portion.
748
+ *
749
+ * Multiple variants are supported because ASR may transcribe contractions
750
+ * differently (e.g., "What's this song" vs "What is this song").
751
+ *
752
+ * Matching rules:
753
+ * - Case insensitive
754
+ * - Leading/trailing whitespace trimmed
755
+ * - Multiple spaces collapsed
756
+ * - Punctuation (?.!,) stripped for matching
757
+ * - Apostrophes preserved (part of contractions)
758
+ *
759
+ * Can be set via:
760
+ * - Server-side game config (production)
761
+ * - Client-side ASRRequest (testing/override) - takes precedence
762
+ *
763
+ * @example ["What's this song", "What is this song"]
764
+ */
765
+ prefixTextToRemove?: string[];
584
766
  /**
585
767
  * Additional provider-specific options
586
768
  *
@@ -1040,6 +1222,26 @@ interface IRecognitionClient {
1040
1222
  * @returns WebSocket URL string
1041
1223
  */
1042
1224
  getUrl(): string;
1225
+ /**
1226
+ * Send game context after connection is established (for preconnect flow).
1227
+ *
1228
+ * Preconnect flow: Create client with asrRequestConfig (useContext: true) but
1229
+ * WITHOUT gameContext → call connect() → WS opens, ASRRequest sent, server
1230
+ * waits in PENDING_CONTEXT → later call sendGameContext() with slotMap →
1231
+ * server attaches provider and sends READY.
1232
+ *
1233
+ * This enables connecting early (before slotMap is known) and sending
1234
+ * game context later when question data is available.
1235
+ *
1236
+ * @param context - Game context including slotMap for keyword boosting
1237
+ */
1238
+ sendGameContext(context: GameContextV1): void;
1239
+ /**
1240
+ * Check if server has sent READY signal (provider is connected and ready for audio).
1241
+ * In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
1242
+ * @returns true if server is ready to receive audio
1243
+ */
1244
+ isServerReady(): boolean;
1043
1245
  }
1044
1246
  /**
1045
1247
  * Client statistics interface
@@ -1114,8 +1316,11 @@ type TranscriptionResult = TranscriptionResultV1;
1114
1316
  */
1115
1317
  declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
1116
1318
  private static readonly PROTOCOL_VERSION;
1319
+ private static readonly MAX_PREFIX_BUFFER_BYTES;
1117
1320
  private config;
1118
1321
  private audioBuffer;
1322
+ private prefixBuffer;
1323
+ private prefixBufferBytes;
1119
1324
  private messageHandler;
1120
1325
  private state;
1121
1326
  private connectionPromise;
@@ -1160,6 +1365,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1160
1365
  isStopping(): boolean;
1161
1366
  isTranscriptionFinished(): boolean;
1162
1367
  isBufferOverflowing(): boolean;
1368
+ isServerReady(): boolean;
1369
+ sendGameContext(context: GameContextV1): void;
1163
1370
  getStats(): IRecognitionClientStats;
1164
1371
  protected onConnected(): void;
1165
1372
  protected onDisconnected(code: number, reason: string): void;
@@ -1183,6 +1390,28 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1183
1390
  * @param audioData - Audio data to send
1184
1391
  */
1185
1392
  private sendAudioNow;
1393
+ /**
1394
+ * Send prefix audio to the server.
1395
+ * Prefix audio is sent before user audio and is used for context/priming.
1396
+ * The server will process it but adjust timing so transcripts reflect user audio timing.
1397
+ *
1398
+ * Note: Prefix audio is buffered until READY state, then flushed before user audio.
1399
+ * This ensures proper ordering even if called before server is ready.
1400
+ *
1401
+ * @param audioData - Prefix audio data (ArrayBuffer, ArrayBufferView, or Blob)
1402
+ */
1403
+ sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
1404
+ /**
1405
+ * Internal method to handle prefix audio with buffering
1406
+ * Buffers if not READY, sends immediately if READY
1407
+ */
1408
+ private sendPrefixAudioInternal;
1409
+ /**
1410
+ * Send prefix audio immediately to the server (without buffering)
1411
+ * Uses an encoding offset to mark the data as prefix audio
1412
+ * @param audioData - Prefix audio data to send
1413
+ */
1414
+ private sendPrefixAudioNow;
1186
1415
  }
1187
1416
 
1188
1417
  export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };