@volley/recognition-client-sdk 0.1.424 → 0.1.621

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,7 +15,11 @@ declare enum RecognitionProvider {
15
15
  GOOGLE = "google",
16
16
  GEMINI_BATCH = "gemini-batch",
17
17
  OPENAI_BATCH = "openai-batch",
18
- OPENAI_REALTIME = "openai-realtime"
18
+ OPENAI_REALTIME = "openai-realtime",
19
+ MISTRAL_VOXTRAL = "mistral-voxtral",
20
+ DASHSCOPE = "dashscope",
21
+ TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
22
+ TEST_ASR_STREAMING = "test-asr-streaming"
19
23
  }
20
24
  /**
21
25
  * ASR API type - distinguishes between streaming and file-based transcription APIs
@@ -77,14 +81,31 @@ declare enum ElevenLabsModel {
77
81
  * OpenAI Realtime API transcription models
78
82
  * These are the verified `input_audio_transcription.model` values.
79
83
  * @see https://platform.openai.com/docs/guides/realtime
84
+ * @see https://platform.openai.com/docs/models/gpt-4o-transcribe
80
85
  */
81
86
  declare enum OpenAIRealtimeModel {
87
+ GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
82
88
  GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
83
89
  }
90
+ /**
91
+ * Mistral Voxtral Realtime transcription models
92
+ * @see https://docs.mistral.ai/models/voxtral-mini-transcribe-realtime-26-02
93
+ */
94
+ declare enum MistralVoxtralModel {
95
+ VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
96
+ }
97
+ /**
98
+ * DashScope Qwen-ASR Realtime transcription models
99
+ * @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
100
+ */
101
+ declare enum DashScopeModel {
102
+ QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
103
+ QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
104
+ }
84
105
  /**
85
106
  * Type alias for any model from any provider
86
107
  */
87
- type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
108
+ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
88
109
 
89
110
  /**
90
111
  * Audio encoding types
@@ -230,8 +251,10 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
230
251
  type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
231
252
  audioUtteranceId: z.ZodString;
232
253
  finalTranscript: z.ZodString;
254
+ finalTranscriptRaw: z.ZodString;
233
255
  finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
234
256
  pendingTranscript: z.ZodOptional<z.ZodString>;
257
+ pendingTranscriptRaw: z.ZodOptional<z.ZodString>;
235
258
  pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
236
259
  is_finished: z.ZodBoolean;
237
260
  voiceStart: z.ZodOptional<z.ZodNumber>;
@@ -241,13 +264,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
241
264
  endTimestamp: z.ZodOptional<z.ZodNumber>;
242
265
  receivedAtMs: z.ZodOptional<z.ZodNumber>;
243
266
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
267
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
244
268
  }, "strip", z.ZodTypeAny, {
245
269
  type: RecognitionResultTypeV1.TRANSCRIPTION;
246
270
  audioUtteranceId: string;
247
271
  finalTranscript: string;
272
+ finalTranscriptRaw: string;
248
273
  is_finished: boolean;
249
274
  finalTranscriptConfidence?: number | undefined;
250
275
  pendingTranscript?: string | undefined;
276
+ pendingTranscriptRaw?: string | undefined;
251
277
  pendingTranscriptConfidence?: number | undefined;
252
278
  voiceStart?: number | undefined;
253
279
  voiceDuration?: number | undefined;
@@ -256,13 +282,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
256
282
  endTimestamp?: number | undefined;
257
283
  receivedAtMs?: number | undefined;
258
284
  accumulatedAudioTimeMs?: number | undefined;
285
+ rawAudioTimeMs?: number | undefined;
259
286
  }, {
260
287
  type: RecognitionResultTypeV1.TRANSCRIPTION;
261
288
  audioUtteranceId: string;
262
289
  finalTranscript: string;
290
+ finalTranscriptRaw: string;
263
291
  is_finished: boolean;
264
292
  finalTranscriptConfidence?: number | undefined;
265
293
  pendingTranscript?: string | undefined;
294
+ pendingTranscriptRaw?: string | undefined;
266
295
  pendingTranscriptConfidence?: number | undefined;
267
296
  voiceStart?: number | undefined;
268
297
  voiceDuration?: number | undefined;
@@ -271,6 +300,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
271
300
  endTimestamp?: number | undefined;
272
301
  receivedAtMs?: number | undefined;
273
302
  accumulatedAudioTimeMs?: number | undefined;
303
+ rawAudioTimeMs?: number | undefined;
274
304
  }>;
275
305
  type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
276
306
  /**
@@ -300,11 +330,22 @@ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
300
330
  * - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
301
331
  * - EMPTY → recog.client.websocket.transcript.final_empty
302
332
  * - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
333
+ * - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
303
334
  */
304
335
  declare enum TranscriptOutcomeType {
305
336
  WITH_CONTENT = "with_content",
306
337
  EMPTY = "empty",
307
- NEVER_SENT = "never_sent"
338
+ NEVER_SENT = "never_sent",
339
+ ERROR_AUTHENTICATION = "error_authentication",
340
+ ERROR_VALIDATION = "error_validation",
341
+ ERROR_PROVIDER = "error_provider",
342
+ ERROR_TIMEOUT = "error_timeout",
343
+ ERROR_QUOTA = "error_quota",
344
+ ERROR_INTERNAL_QUOTA = "error_internal_quota",
345
+ ERROR_CONNECTION = "error_connection",
346
+ ERROR_NO_AUDIO = "error_no_audio",
347
+ ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
348
+ ERROR_UNKNOWN = "error_unknown"
308
349
  }
309
350
  /**
310
351
  * Metadata result V1 - contains metadata, timing information, and ASR config
@@ -314,6 +355,7 @@ declare enum TranscriptOutcomeType {
314
355
  declare const MetadataResultSchemaV1: z.ZodObject<{
315
356
  type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
316
357
  audioUtteranceId: z.ZodString;
358
+ connectionInitiatedAtMs: z.ZodOptional<z.ZodNumber>;
317
359
  recordingStartMs: z.ZodOptional<z.ZodNumber>;
318
360
  recordingEndMs: z.ZodOptional<z.ZodNumber>;
319
361
  transcriptEndMs: z.ZodOptional<z.ZodNumber>;
@@ -321,14 +363,53 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
321
363
  duration: z.ZodOptional<z.ZodNumber>;
322
364
  volume: z.ZodOptional<z.ZodNumber>;
323
365
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
366
+ rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
324
367
  costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
325
368
  apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
326
369
  asrConfig: z.ZodOptional<z.ZodString>;
327
370
  rawAsrMetadata: z.ZodOptional<z.ZodString>;
328
371
  transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
372
+ audioMetrics: z.ZodOptional<z.ZodObject<{
373
+ valid: z.ZodBoolean;
374
+ audioBeginMs: z.ZodNumber;
375
+ audioEndMs: z.ZodNumber;
376
+ maxVolume: z.ZodNumber;
377
+ minVolume: z.ZodNumber;
378
+ avgVolume: z.ZodNumber;
379
+ silenceRatio: z.ZodNumber;
380
+ clippingRatio: z.ZodNumber;
381
+ snrEstimate: z.ZodNullable<z.ZodNumber>;
382
+ lastNonSilenceMs: z.ZodNumber;
383
+ timestamp: z.ZodString;
384
+ }, "strip", z.ZodTypeAny, {
385
+ valid: boolean;
386
+ audioBeginMs: number;
387
+ audioEndMs: number;
388
+ maxVolume: number;
389
+ minVolume: number;
390
+ avgVolume: number;
391
+ silenceRatio: number;
392
+ clippingRatio: number;
393
+ snrEstimate: number | null;
394
+ lastNonSilenceMs: number;
395
+ timestamp: string;
396
+ }, {
397
+ valid: boolean;
398
+ audioBeginMs: number;
399
+ audioEndMs: number;
400
+ maxVolume: number;
401
+ minVolume: number;
402
+ avgVolume: number;
403
+ silenceRatio: number;
404
+ clippingRatio: number;
405
+ snrEstimate: number | null;
406
+ lastNonSilenceMs: number;
407
+ timestamp: string;
408
+ }>>;
329
409
  }, "strip", z.ZodTypeAny, {
330
410
  type: RecognitionResultTypeV1.METADATA;
331
411
  audioUtteranceId: string;
412
+ connectionInitiatedAtMs?: number | undefined;
332
413
  recordingStartMs?: number | undefined;
333
414
  recordingEndMs?: number | undefined;
334
415
  transcriptEndMs?: number | undefined;
@@ -336,14 +417,29 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
336
417
  duration?: number | undefined;
337
418
  volume?: number | undefined;
338
419
  accumulatedAudioTimeMs?: number | undefined;
420
+ rawAudioTimeMs?: number | undefined;
339
421
  costInUSD?: number | undefined;
340
422
  apiType?: ASRApiType | undefined;
341
423
  asrConfig?: string | undefined;
342
424
  rawAsrMetadata?: string | undefined;
343
425
  transcriptOutcome?: TranscriptOutcomeType | undefined;
426
+ audioMetrics?: {
427
+ valid: boolean;
428
+ audioBeginMs: number;
429
+ audioEndMs: number;
430
+ maxVolume: number;
431
+ minVolume: number;
432
+ avgVolume: number;
433
+ silenceRatio: number;
434
+ clippingRatio: number;
435
+ snrEstimate: number | null;
436
+ lastNonSilenceMs: number;
437
+ timestamp: string;
438
+ } | undefined;
344
439
  }, {
345
440
  type: RecognitionResultTypeV1.METADATA;
346
441
  audioUtteranceId: string;
442
+ connectionInitiatedAtMs?: number | undefined;
347
443
  recordingStartMs?: number | undefined;
348
444
  recordingEndMs?: number | undefined;
349
445
  transcriptEndMs?: number | undefined;
@@ -351,11 +447,25 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
351
447
  duration?: number | undefined;
352
448
  volume?: number | undefined;
353
449
  accumulatedAudioTimeMs?: number | undefined;
450
+ rawAudioTimeMs?: number | undefined;
354
451
  costInUSD?: number | undefined;
355
452
  apiType?: ASRApiType | undefined;
356
453
  asrConfig?: string | undefined;
357
454
  rawAsrMetadata?: string | undefined;
358
455
  transcriptOutcome?: TranscriptOutcomeType | undefined;
456
+ audioMetrics?: {
457
+ valid: boolean;
458
+ audioBeginMs: number;
459
+ audioEndMs: number;
460
+ maxVolume: number;
461
+ minVolume: number;
462
+ avgVolume: number;
463
+ silenceRatio: number;
464
+ clippingRatio: number;
465
+ snrEstimate: number | null;
466
+ lastNonSilenceMs: number;
467
+ timestamp: string;
468
+ } | undefined;
359
469
  }>;
360
470
  type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
361
471
  /**
@@ -367,7 +477,10 @@ declare enum ErrorTypeV1 {
367
477
  PROVIDER_ERROR = "provider_error",
368
478
  TIMEOUT_ERROR = "timeout_error",
369
479
  QUOTA_EXCEEDED = "quota_exceeded",
480
+ INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
370
481
  CONNECTION_ERROR = "connection_error",
482
+ NO_AUDIO_ERROR = "no_audio_error",
483
+ CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
371
484
  UNKNOWN_ERROR = "unknown_error"
372
485
  }
373
486
  /**
@@ -915,6 +1028,36 @@ declare const RecognitionExceptionSchema: z.ZodDiscriminatedUnion<"errorType", [
915
1028
  attempts?: number | undefined;
916
1029
  url?: string | undefined;
917
1030
  underlyingError?: string | undefined;
1031
+ }>, z.ZodObject<{
1032
+ code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
1033
+ message: z.ZodString;
1034
+ description: z.ZodOptional<z.ZodString>;
1035
+ audioUtteranceId: z.ZodOptional<z.ZodString>;
1036
+ timestamp: z.ZodOptional<z.ZodNumber>;
1037
+ errorType: z.ZodLiteral<ErrorTypeV1.CIRCUIT_BREAKER_OPEN>;
1038
+ isImmediatelyAvailable: z.ZodLiteral<true>;
1039
+ provider: z.ZodOptional<z.ZodNativeEnum<typeof RecognitionProvider>>;
1040
+ model: z.ZodOptional<z.ZodString>;
1041
+ }, "strip", z.ZodTypeAny, {
1042
+ message: string;
1043
+ errorType: ErrorTypeV1.CIRCUIT_BREAKER_OPEN;
1044
+ isImmediatelyAvailable: true;
1045
+ code?: string | number | undefined;
1046
+ description?: string | undefined;
1047
+ audioUtteranceId?: string | undefined;
1048
+ timestamp?: number | undefined;
1049
+ provider?: RecognitionProvider | undefined;
1050
+ model?: string | undefined;
1051
+ }, {
1052
+ message: string;
1053
+ errorType: ErrorTypeV1.CIRCUIT_BREAKER_OPEN;
1054
+ isImmediatelyAvailable: true;
1055
+ code?: string | number | undefined;
1056
+ description?: string | undefined;
1057
+ audioUtteranceId?: string | undefined;
1058
+ timestamp?: number | undefined;
1059
+ provider?: RecognitionProvider | undefined;
1060
+ model?: string | undefined;
918
1061
  }>, z.ZodObject<{
919
1062
  code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
920
1063
  message: z.ZodString;
@@ -980,6 +1123,15 @@ declare enum ControlSignalTypeV1 {
980
1123
  START_RECORDING = "start_recording",
981
1124
  STOP_RECORDING = "stop_recording"
982
1125
  }
1126
+ /**
1127
+ * Prefix audio mode for ASR Request V1
1128
+ * Controls how prefix audio is handled during recognition
1129
+ */
1130
+ declare enum PrefixMode {
1131
+ NONE = "none",
1132
+ CLIENT = "client",
1133
+ STORED = "stored"
1134
+ }
983
1135
  /**
984
1136
  * SlotMap - A strongly typed map from slot names to lists of values
985
1137
  * Used for entity extraction and slot filling in voice interactions
@@ -1030,6 +1182,38 @@ declare const ASRRequestSchemaV1: z.ZodObject<{
1030
1182
  interimResults: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
1031
1183
  useContext: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
1032
1184
  finalTranscriptStability: z.ZodOptional<z.ZodString>;
1185
+ priority: z.ZodDefault<z.ZodOptional<z.ZodEnum<["low", "high"]>>>;
1186
+ fallbackModels: z.ZodOptional<z.ZodArray<z.ZodObject<{
1187
+ provider: z.ZodString;
1188
+ model: z.ZodOptional<z.ZodString>;
1189
+ language: z.ZodOptional<z.ZodString>;
1190
+ sampleRate: z.ZodOptional<z.ZodNumber>;
1191
+ encoding: z.ZodOptional<z.ZodNumber>;
1192
+ interimResults: z.ZodOptional<z.ZodBoolean>;
1193
+ useContext: z.ZodOptional<z.ZodBoolean>;
1194
+ finalTranscriptStability: z.ZodOptional<z.ZodString>;
1195
+ }, "strip", z.ZodTypeAny, {
1196
+ provider: string;
1197
+ model?: string | undefined;
1198
+ language?: string | undefined;
1199
+ sampleRate?: number | undefined;
1200
+ encoding?: number | undefined;
1201
+ interimResults?: boolean | undefined;
1202
+ useContext?: boolean | undefined;
1203
+ finalTranscriptStability?: string | undefined;
1204
+ }, {
1205
+ provider: string;
1206
+ model?: string | undefined;
1207
+ language?: string | undefined;
1208
+ sampleRate?: number | undefined;
1209
+ encoding?: number | undefined;
1210
+ interimResults?: boolean | undefined;
1211
+ useContext?: boolean | undefined;
1212
+ finalTranscriptStability?: string | undefined;
1213
+ }>, "many">>;
1214
+ prefixMode: z.ZodDefault<z.ZodOptional<z.ZodNativeEnum<typeof PrefixMode>>>;
1215
+ prefixId: z.ZodOptional<z.ZodString>;
1216
+ prefixTextToRemove: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
1033
1217
  debugCommand: z.ZodOptional<z.ZodObject<{
1034
1218
  enableDebugLog: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
1035
1219
  enableAudioStorage: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
@@ -1047,16 +1231,30 @@ declare const ASRRequestSchemaV1: z.ZodObject<{
1047
1231
  enablePilotModels?: boolean | undefined;
1048
1232
  }>>;
1049
1233
  }, "strip", z.ZodTypeAny, {
1234
+ type: RecognitionContextTypeV1.ASR_REQUEST;
1050
1235
  provider: string;
1051
1236
  language: string;
1052
1237
  sampleRate: number;
1053
1238
  encoding: number;
1054
1239
  interimResults: boolean;
1055
1240
  useContext: boolean;
1056
- type: RecognitionContextTypeV1.ASR_REQUEST;
1241
+ priority: "low" | "high";
1242
+ prefixMode: PrefixMode;
1057
1243
  audioUtteranceId?: string | undefined;
1058
1244
  model?: string | undefined;
1059
1245
  finalTranscriptStability?: string | undefined;
1246
+ fallbackModels?: {
1247
+ provider: string;
1248
+ model?: string | undefined;
1249
+ language?: string | undefined;
1250
+ sampleRate?: number | undefined;
1251
+ encoding?: number | undefined;
1252
+ interimResults?: boolean | undefined;
1253
+ useContext?: boolean | undefined;
1254
+ finalTranscriptStability?: string | undefined;
1255
+ }[] | undefined;
1256
+ prefixId?: string | undefined;
1257
+ prefixTextToRemove?: string[] | undefined;
1060
1258
  debugCommand?: {
1061
1259
  enableDebugLog: boolean;
1062
1260
  enableAudioStorage: boolean;
@@ -1064,16 +1262,30 @@ declare const ASRRequestSchemaV1: z.ZodObject<{
1064
1262
  enablePilotModels: boolean;
1065
1263
  } | undefined;
1066
1264
  }, {
1265
+ type: RecognitionContextTypeV1.ASR_REQUEST;
1067
1266
  provider: string;
1068
1267
  language: string;
1069
1268
  sampleRate: number;
1070
1269
  encoding: number;
1071
- type: RecognitionContextTypeV1.ASR_REQUEST;
1072
1270
  audioUtteranceId?: string | undefined;
1073
1271
  model?: string | undefined;
1074
1272
  interimResults?: boolean | undefined;
1075
1273
  useContext?: boolean | undefined;
1076
1274
  finalTranscriptStability?: string | undefined;
1275
+ priority?: "low" | "high" | undefined;
1276
+ fallbackModels?: {
1277
+ provider: string;
1278
+ model?: string | undefined;
1279
+ language?: string | undefined;
1280
+ sampleRate?: number | undefined;
1281
+ encoding?: number | undefined;
1282
+ interimResults?: boolean | undefined;
1283
+ useContext?: boolean | undefined;
1284
+ finalTranscriptStability?: string | undefined;
1285
+ }[] | undefined;
1286
+ prefixMode?: PrefixMode | undefined;
1287
+ prefixId?: string | undefined;
1288
+ prefixTextToRemove?: string[] | undefined;
1077
1289
  debugCommand?: {
1078
1290
  enableDebugLog?: boolean | undefined;
1079
1291
  enableAudioStorage?: boolean | undefined;
@@ -1110,13 +1322,13 @@ declare enum FinalTranscriptStability {
1110
1322
  */
1111
1323
  AGGRESSIVE = "aggressive",
1112
1324
  /**
1113
- * Balanced mode: 200ms timeout (default)
1325
+ * Balanced mode: 500ms timeout (default)
1114
1326
  * Natural middle ground for most conversational scenarios
1115
1327
  * Use cases: General customer support, tech support, typical voice interactions
1116
1328
  */
1117
1329
  BALANCED = "balanced",
1118
1330
  /**
1119
- * Conservative mode: 400ms timeout
1331
+ * Conservative mode: 1000ms timeout
1120
1332
  * Wait longer for providers, optimized for complex/reflective speech
1121
1333
  * Use cases: Healthcare, complex queries, careful thought processes
1122
1334
  */
@@ -1208,13 +1420,70 @@ interface ASRRequestConfig {
1208
1420
  * doesn't respond with is_final=true after stopRecording().
1209
1421
  *
1210
1422
  * - aggressive: 100ms - fast response, may cut off slow providers
1211
- * - balanced: 200ms - current default, good for most cases
1212
- * - conservative: 400ms - wait longer for complex utterances
1423
+ * - balanced: 500ms - current default, good for most cases
1424
+ * - conservative: 1000ms - wait longer for complex utterances
1213
1425
  *
1214
1426
  * @default 'balanced'
1215
1427
  * @see FinalTranscriptStability enum for detailed descriptions
1216
1428
  */
1217
1429
  finalTranscriptStability?: FinalTranscriptStability | string;
1430
+ /**
1431
+ * Traffic control priority for quota slot allocation
1432
+ *
1433
+ * Controls which quota slots this request can use when traffic control is enabled.
1434
+ * The quota system reserves a portion of slots for high-priority requests.
1435
+ *
1436
+ * - 'high': Can use all quota slots (reserved for critical games like song-quiz)
1437
+ * - 'low': Limited to non-reserved slots (default for most requests)
1438
+ *
1439
+ * @default 'low'
1440
+ */
1441
+ priority?: 'low' | 'high';
1442
+ /**
1443
+ * Prefix audio injection mode
1444
+ *
1445
+ * Controls how prefix audio is handled:
1446
+ * - 'none': No prefix audio (default)
1447
+ * - 'client': Client sends PREFIX_AUDIO before user audio
1448
+ * - 'stored': Server injects stored prefix audio by prefixId
1449
+ *
1450
+ * @default 'none'
1451
+ */
1452
+ prefixMode?: PrefixMode | string;
1453
+ /**
1454
+ * Stored prefix audio identifier
1455
+ *
1456
+ * Only used when prefixMode='stored'. The server will look up this ID
1457
+ * in the PrefixAudioCache and inject the corresponding audio before
1458
+ * user audio is processed.
1459
+ *
1460
+ * @example 'song_quiz'
1461
+ */
1462
+ prefixId?: string;
1463
+ /**
1464
+ * Prefix text patterns to remove from transcripts
1465
+ *
1466
+ * Array of prefix text variants that should be stripped from the transcript.
1467
+ * This is used when prefix audio is injected and the ASR transcribes both
1468
+ * the prefix and user speech - we remove the prefix portion.
1469
+ *
1470
+ * Multiple variants are supported because ASR may transcribe contractions
1471
+ * differently (e.g., "What's this song" vs "What is this song").
1472
+ *
1473
+ * Matching rules:
1474
+ * - Case insensitive
1475
+ * - Leading/trailing whitespace trimmed
1476
+ * - Multiple spaces collapsed
1477
+ * - Punctuation (?.!,) stripped for matching
1478
+ * - Apostrophes preserved (part of contractions)
1479
+ *
1480
+ * Can be set via:
1481
+ * - Server-side game config (production)
1482
+ * - Client-side ASRRequest (testing/override) - takes precedence
1483
+ *
1484
+ * @example ["What's this song", "What is this song"]
1485
+ */
1486
+ prefixTextToRemove?: string[];
1218
1487
  /**
1219
1488
  * Additional provider-specific options
1220
1489
  *
@@ -1710,6 +1979,26 @@ interface IRecognitionClient {
1710
1979
  * @returns WebSocket URL string
1711
1980
  */
1712
1981
  getUrl(): string;
1982
+ /**
1983
+ * Send game context after connection is established (for preconnect flow).
1984
+ *
1985
+ * Preconnect flow: Create client with asrRequestConfig (useContext: true) but
1986
+ * WITHOUT gameContext → call connect() → WS opens, ASRRequest sent, server
1987
+ * waits in PENDING_CONTEXT → later call sendGameContext() with slotMap →
1988
+ * server attaches provider and sends READY.
1989
+ *
1990
+ * This enables connecting early (before slotMap is known) and sending
1991
+ * game context later when question data is available.
1992
+ *
1993
+ * @param context - Game context including slotMap for keyword boosting
1994
+ */
1995
+ sendGameContext(context: GameContextV1): void;
1996
+ /**
1997
+ * Check if server has sent READY signal (provider is connected and ready for audio).
1998
+ * In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
1999
+ * @returns true if server is ready to receive audio
2000
+ */
2001
+ isServerReady(): boolean;
1713
2002
  }
1714
2003
  /**
1715
2004
  * Client statistics interface
@@ -1790,8 +2079,11 @@ type TranscriptionResult = TranscriptionResultV1;
1790
2079
  */
1791
2080
  declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
1792
2081
  private static readonly PROTOCOL_VERSION;
2082
+ private static readonly MAX_PREFIX_BUFFER_BYTES;
1793
2083
  private config;
1794
2084
  private audioBuffer;
2085
+ private prefixBuffer;
2086
+ private prefixBufferBytes;
1795
2087
  private messageHandler;
1796
2088
  private state;
1797
2089
  private connectionPromise;
@@ -1836,6 +2128,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1836
2128
  isStopping(): boolean;
1837
2129
  isTranscriptionFinished(): boolean;
1838
2130
  isBufferOverflowing(): boolean;
2131
+ isServerReady(): boolean;
2132
+ sendGameContext(context: GameContextV1): void;
1839
2133
  getStats(): IRecognitionClientStats;
1840
2134
  protected onConnected(): void;
1841
2135
  protected onDisconnected(code: number, reason: string): void;
@@ -1859,6 +2153,28 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
1859
2153
  * @param audioData - Audio data to send
1860
2154
  */
1861
2155
  private sendAudioNow;
2156
+ /**
2157
+ * Send prefix audio to the server.
2158
+ * Prefix audio is sent before user audio and is used for context/priming.
2159
+ * The server will process it but adjust timing so transcripts reflect user audio timing.
2160
+ *
2161
+ * Note: Prefix audio is buffered until READY state, then flushed before user audio.
2162
+ * This ensures proper ordering even if called before server is ready.
2163
+ *
2164
+ * @param audioData - Prefix audio data (ArrayBuffer, ArrayBufferView, or Blob)
2165
+ */
2166
+ sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
2167
+ /**
2168
+ * Internal method to handle prefix audio with buffering
2169
+ * Buffers if not READY, sends immediately if READY
2170
+ */
2171
+ private sendPrefixAudioInternal;
2172
+ /**
2173
+ * Send prefix audio immediately to the server (without buffering)
2174
+ * Uses encoding offset to mark as prefix audio
2175
+ * @param audioData - Prefix audio data to send
2176
+ */
2177
+ private sendPrefixAudioNow;
1862
2178
  }
1863
2179
 
1864
2180
  /**
@@ -2244,6 +2560,20 @@ interface ISimplifiedVGFRecognitionClient {
2244
2560
  * Check if the audio buffer has overflowed
2245
2561
  */
2246
2562
  isBufferOverflowing(): boolean;
2563
+ /**
2564
+ * Send game context after connection is established (for preconnect flow).
2565
+ *
2566
+ * Preconnect flow: Create client with asrRequestConfig (useContext: true) but
2567
+ * WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
2568
+ *
2569
+ * @param context - Game context including slotMap for keyword boosting
2570
+ */
2571
+ sendGameContext(context: GameContextV1): void;
2572
+ /**
2573
+ * Check if server has sent READY signal (provider connected, ready for audio).
2574
+ * In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
2575
+ */
2576
+ isServerReady(): boolean;
2247
2577
  /**
2248
2578
  * Get the audio utterance ID for this session
2249
2579
  */
@@ -2282,6 +2612,8 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
2282
2612
  isStopping(): boolean;
2283
2613
  isTranscriptionFinished(): boolean;
2284
2614
  isBufferOverflowing(): boolean;
2615
+ sendGameContext(context: GameContextV1): void;
2616
+ isServerReady(): boolean;
2285
2617
  getVGFState(): RecognitionState;
2286
2618
  private isTerminalStatus;
2287
2619
  private notifyStateChange;
@@ -2396,5 +2728,5 @@ declare function getRecognitionConductorHttpBase(stage?: Stage | string | null |
2396
2728
  declare function getRecognitionConductorWsBase(stage?: Stage | string | null | undefined): string;
2397
2729
  declare function getRecognitionConductorHost(stage?: Stage | string | null | undefined): string;
2398
2730
 
2399
- export { AudioEncoding, ClientControlActionV1, ClientState, ConfigBuilder, ConnectionError, ControlSignalTypeV1 as ControlSignal, ControlSignalTypeV1, DeepgramModel, ElevenLabsModel, ErrorTypeV1, FinalTranscriptStability, FireworksModel, GeminiModel, GoogleModel, Language, OpenAIModel, RECOGNITION_CONDUCTOR_BASES, RECOGNITION_SERVICE_BASES, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1, RecognitionError, RecognitionProvider, RecognitionResultTypeV1, RecognitionVGFStateSchema, RecordingStatus, STAGES, SampleRate, SimplifiedVGFRecognitionClient, TimeoutError, TranscriptionStatus, ValidationError, createClient, createClientWithBuilder, createDefaultASRConfig, createInitialRecognitionState, createSimplifiedVGFClient, getRecognitionConductorBase, getRecognitionConductorHost, getRecognitionConductorHttpBase, getRecognitionConductorWsBase, getRecognitionServiceBase, getRecognitionServiceHost, getRecognitionServiceHttpBase, getRecognitionServiceWsBase, getUserFriendlyMessage, isExceptionImmediatelyAvailable, isNormalDisconnection, isValidRecordingStatusTransition, normalizeStage, resetRecognitionVGFState };
2731
+ export { AudioEncoding, ClientControlActionV1, ClientState, ConfigBuilder, ConnectionError, ControlSignalTypeV1 as ControlSignal, ControlSignalTypeV1, DashScopeModel, DeepgramModel, ElevenLabsModel, ErrorTypeV1, FinalTranscriptStability, FireworksModel, GeminiModel, GoogleModel, Language, MistralVoxtralModel, OpenAIModel, RECOGNITION_CONDUCTOR_BASES, RECOGNITION_SERVICE_BASES, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1, RecognitionError, RecognitionProvider, RecognitionResultTypeV1, RecognitionVGFStateSchema, RecordingStatus, STAGES, SampleRate, SimplifiedVGFRecognitionClient, TimeoutError, TranscriptionStatus, ValidationError, createClient, createClientWithBuilder, createDefaultASRConfig, createInitialRecognitionState, createSimplifiedVGFClient, getRecognitionConductorBase, getRecognitionConductorHost, getRecognitionConductorHttpBase, getRecognitionConductorWsBase, getRecognitionServiceBase, getRecognitionServiceHost, getRecognitionServiceHttpBase, getRecognitionServiceWsBase, getUserFriendlyMessage, isExceptionImmediatelyAvailable, isNormalDisconnection, isValidRecordingStatusTransition, normalizeStage, resetRecognitionVGFState };
2400
2732
  export type { ASRRequestConfig, ASRRequestV1, AuthenticationException, ConnectionException, ErrorResultV1, FunctionCallResultV1, GameContextV1, IRecognitionClient, IRecognitionClientConfig, IRecognitionClientStats, ISimplifiedVGFRecognitionClient, MetadataResultV1, ProviderException, QuotaExceededException, RealTimeTwoWayWebSocketRecognitionClientConfig, RecognitionCallbackUrl, RecognitionException, RecognitionState, RecordingStatusType, SimplifiedVGFClientConfig, SlotMap, Stage, TimeoutException, TranscriptionResult, TranscriptionResultV1, TranscriptionStatusType, UnknownException, ValidationException };
package/dist/index.d.ts CHANGED
@@ -11,6 +11,6 @@ export { type RecognitionState, RecognitionVGFStateSchema, RecordingStatus, Tran
11
11
  export { resetRecognitionVGFState } from './vgf-recognition-mapper.js';
12
12
  export { AudioEncoding } from '@recog/websocket';
13
13
  export { type GameContextV1, type SlotMap, RecognitionContextTypeV1, ControlSignalTypeV1, ControlSignalTypeV1 as ControlSignal, // Alias for backward compatibility
14
- type TranscriptionResultV1, type FunctionCallResultV1, type MetadataResultV1, type ErrorResultV1, RecognitionResultTypeV1, ClientControlActionV1, type ASRRequestConfig, type ASRRequestV1, FinalTranscriptStability, createDefaultASRConfig, RecognitionProvider, DeepgramModel, ElevenLabsModel, FireworksModel, GoogleModel, GeminiModel, OpenAIModel, Language, SampleRate, STAGES, type Stage } from '@recog/shared-types';
14
+ type TranscriptionResultV1, type FunctionCallResultV1, type MetadataResultV1, type ErrorResultV1, RecognitionResultTypeV1, ClientControlActionV1, type ASRRequestConfig, type ASRRequestV1, FinalTranscriptStability, createDefaultASRConfig, RecognitionProvider, DeepgramModel, ElevenLabsModel, FireworksModel, GoogleModel, GeminiModel, OpenAIModel, MistralVoxtralModel, DashScopeModel, Language, SampleRate, STAGES, type Stage } from '@recog/shared-types';
15
15
  export { getRecognitionServiceBase, getRecognitionServiceHttpBase, getRecognitionServiceWsBase, getRecognitionServiceHost, getRecognitionConductorBase, getRecognitionConductorHttpBase, getRecognitionConductorWsBase, getRecognitionConductorHost, normalizeStage, RECOGNITION_SERVICE_BASES, RECOGNITION_CONDUCTOR_BASES } from '@recog/shared-config';
16
16
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,wCAAwC,EACxC,KAAK,8CAA8C,EACnD,KAAK,mBAAmB,EACxB,qBAAqB,EACtB,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,wBAAwB,EAC7B,KAAK,uBAAuB,EAC5B,KAAK,sBAAsB,EAC3B,WAAW,EACZ,MAAM,+BAA+B,CAAC;AAGvC,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAGpD,OAAO,EAAE,YAAY,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAC;AAGrE,OAAO,EACL,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EAChB,MAAM,aAAa,CAAC;AAGrB,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAGlD,YAAY,EACV,oBAAoB,EACpB,mBAAmB,EACnB,gBAAgB,EAChB,mBAAmB,EACnB,uBAAuB,EACvB,iBAAiB,EACjB,sBAAsB,EACtB,gBAAgB,EACjB,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,+BAA+B,EAC/B,sBAAsB,EACvB,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,8BAA8B,EAC9B,yBAAyB,EACzB,KAAK,+BAA+B,EACpC,KAAK,yBAAyB,EAC/B,MAAM,wCAAwC,CAAC;AAEhD,OAAO,EACL,KAAK,gBAAgB,EACrB,yBAAyB,EACzB,eAAe,EACf,mBAAmB,EACnB,KAAK,mBAAmB,EACxB,KAAK,uBAAuB,EAC5B,6BAA6B,EAC7B,gCAAgC,EACjC,MAAM,4BAA4B,CAAC;AAEpC,OAAO,EAAE,wBAAwB,EAAE,MAAM,6BAA6B,CAAC;AAGvE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGjD,OAAO,EAEL,KAAK,aAAa,EAClB,KAAK,OAAO,EACZ,wBAAwB,EACxB,mBAAmB,EACnB,mBAAmB,IAAI,aAAa,EAAG,mCAAmC;AAG1E,KAAK,qBAAqB,EAC1B,KAAK,oBAAoB,EACzB,KAAK,gBAAgB,EACrB,KAAK,aAAa,EAClB,uBAAuB,EACvB,qBAAqB,EAGrB,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,wBAAwB,EACxB,sBAAsB,EACtB,mBAAmB,EACnB,aAAa,EACb,eAAe,EACf,cAAc,EACd,WAAW,EACX,WAAW,EACX,WAAW,EACX,QAAQ,EACR,UAAU,EAGV,MAAM,EACN,KAAK,KAAK,EACX,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,yBAAyB,EACzB,6BAA6B,EAC7B,2BAA2B,EAC3B,yBAAyB,EACzB,2BAA2B,EAC3B,+BAA+B,EAC/B,6BAA6B,EAC7B,2BAA2B,EAC3B,cAAc,EACd,yBAAyB,EACzB,2BAA2B,EAC5B,MAAM,sBAAsB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EACL,wCAAwC,EACxC,KAAK,8CAA8C,EACnD,KAAK,mBAAmB,EACxB,qBAAqB,EACtB,MAAM,yBAAyB,CAAC;AAGjC,OAAO,EACL,KAAK,kBAAkB,EACvB,KAAK,wBAAwB,EAC7B,KAAK,uBAAuB,EAC5B,KAAK,sBAAsB,EAC3B,WAAW,EACZ,MAAM,+BAA+B,CAAC;AAGvC,OAAO,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAGpD,OAAO,EAAE,YAAY,EAAE,uBAAuB,EAAE,MAAM,cAAc,CAAC;AAGrE,OAAO,EACL,gBAAgB,EAChB,eAAe,EACf,YAAY,EACZ,eAAe,EAChB,MAAM,aAAa,CAAC;AAGrB,OAAO,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAGlD,YAAY,EACV,oBAAoB,EACpB,mBAAmB,EACnB,gBAAgB,EAChB,mBAAmB,EACnB,uBAAuB,EACvB,iBAAiB,EACjB,sBAAsB,EACtB,gBAAgB,EACjB,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,+BAA+B,EAC/B,sBAAsB,EACvB,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,8BAA8B,EAC9B,yBAAyB,EACzB,KAAK,+BAA+B,EACpC,KAAK,yBAAyB,EAC/B,MAAM,wCAAwC,CAAC;AAEhD,OAAO,EACL,KAAK,gBAAgB,EACrB,yBAAyB,EACzB,eAAe,EACf,mBAAmB,EACnB,KAAK,mBAAmB,EACxB,KAAK,uBAAuB,EAC5B,6BAA6B,EAC7B,gCAAgC,EACjC,MAAM,4BAA4B,CAAC;AAEpC,OAAO,EAAE,wBAAwB,EAAE,MAAM,6BAA6B,CAAC;AAGvE,OAAO,EAAE,aAAa,EAAE,MAAM,kBAAkB,CAAC;AAGjD,OAAO,EAEL,KAAK,aAAa,EAClB,KAAK,OAAO,EACZ,wBAAwB,EACxB,mBAAmB,EACnB,mBAAmB,IAAI,aAAa,EAAG,mCAAmC;AAG1E,KAAK,qBAAqB,EAC1B,KAAK,oBAAoB,EACzB,KAAK,gBAAgB,EACrB,KAAK,aAAa,EAClB,uBAAuB,EACvB,qBAAqB,EAGrB,KAAK,gBAAgB,EACrB,KAAK,YAAY,EACjB,wBAAwB,EACxB,sBAAsB,EACtB,mBAAmB,EACnB,aAAa,EACb,eAAe,EACf,cAAc,EACd,WAAW,EACX,WAAW,EACX,WAAW,EACX,mBAAmB,EACnB,cAAc,EACd,QAAQ,EACR,UAAU,EAGV,MAAM,EACN,KAAK,KAAK,EACX,MAAM,qBAAqB,CAAC;AAG7B,OAAO,EACL,yBAAyB,EACzB,6BAA6B,EAC7B,2BAA2B,EAC3B,yBAAyB,EACzB,2BAA2B,EAC3B,+BAA+B,EAC/B,6BAA6B,EAC7B,2BAA2B,EAC3B,cAAc,EACd,yBAAyB,EACzB,2BAA2B,EAC5B,MAAM,sBAAsB,CAAC"}