@volley/recognition-client-sdk 0.1.424 → 0.1.622
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +236 -7
- package/dist/index.bundled.d.ts +393 -52
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +268 -15
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +236 -14
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +28 -1
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +20 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +17 -0
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/vgf-recognition-mapper.d.ts.map +1 -1
- package/dist/vgf-recognition-state.d.ts +6 -0
- package/dist/vgf-recognition-state.d.ts.map +1 -1
- package/package.json +8 -8
- package/src/index.ts +3 -0
- package/src/recognition-client.ts +158 -8
- package/src/recognition-client.types.ts +23 -0
- package/src/simplified-vgf-recognition-client.integration.spec.ts +15 -3
- package/src/simplified-vgf-recognition-client.ts +28 -1
- package/src/utils/audio-ring-buffer.spec.ts +335 -0
- package/src/vgf-recognition-mapper.ts +19 -1
- package/src/vgf-recognition-state.ts +4 -0
|
@@ -15,7 +15,11 @@ declare enum RecognitionProvider {
|
|
|
15
15
|
GOOGLE = "google",
|
|
16
16
|
GEMINI_BATCH = "gemini-batch",
|
|
17
17
|
OPENAI_BATCH = "openai-batch",
|
|
18
|
-
OPENAI_REALTIME = "openai-realtime"
|
|
18
|
+
OPENAI_REALTIME = "openai-realtime",
|
|
19
|
+
MISTRAL_VOXTRAL = "mistral-voxtral",
|
|
20
|
+
DASHSCOPE = "dashscope",
|
|
21
|
+
TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
|
|
22
|
+
TEST_ASR_STREAMING = "test-asr-streaming"
|
|
19
23
|
}
|
|
20
24
|
/**
|
|
21
25
|
* ASR API type - distinguishes between streaming and file-based transcription APIs
|
|
@@ -77,14 +81,31 @@ declare enum ElevenLabsModel {
|
|
|
77
81
|
* OpenAI Realtime API transcription models
|
|
78
82
|
* These are the verified `input_audio_transcription.model` values.
|
|
79
83
|
* @see https://platform.openai.com/docs/guides/realtime
|
|
84
|
+
* @see https://platform.openai.com/docs/models/gpt-4o-transcribe
|
|
80
85
|
*/
|
|
81
86
|
declare enum OpenAIRealtimeModel {
|
|
87
|
+
GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
|
|
82
88
|
GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
|
|
83
89
|
}
|
|
90
|
+
/**
|
|
91
|
+
* Mistral Voxtral Realtime transcription models
|
|
92
|
+
* @see https://docs.mistral.ai/models/voxtral-mini-transcribe-realtime-26-02
|
|
93
|
+
*/
|
|
94
|
+
declare enum MistralVoxtralModel {
|
|
95
|
+
VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* DashScope Qwen-ASR Realtime transcription models
|
|
99
|
+
* @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
|
|
100
|
+
*/
|
|
101
|
+
declare enum DashScopeModel {
|
|
102
|
+
QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
|
|
103
|
+
QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
|
|
104
|
+
}
|
|
84
105
|
/**
|
|
85
106
|
* Type alias for any model from any provider
|
|
86
107
|
*/
|
|
87
|
-
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
|
|
108
|
+
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
|
|
88
109
|
|
|
89
110
|
/**
|
|
90
111
|
* Audio encoding types
|
|
@@ -230,47 +251,59 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
230
251
|
type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
|
|
231
252
|
audioUtteranceId: z.ZodString;
|
|
232
253
|
finalTranscript: z.ZodString;
|
|
254
|
+
finalTranscriptRaw: z.ZodString;
|
|
233
255
|
finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
|
|
234
256
|
pendingTranscript: z.ZodOptional<z.ZodString>;
|
|
257
|
+
pendingTranscriptRaw: z.ZodOptional<z.ZodString>;
|
|
235
258
|
pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
|
|
236
259
|
is_finished: z.ZodBoolean;
|
|
237
260
|
voiceStart: z.ZodOptional<z.ZodNumber>;
|
|
238
261
|
voiceDuration: z.ZodOptional<z.ZodNumber>;
|
|
239
262
|
voiceEnd: z.ZodOptional<z.ZodNumber>;
|
|
263
|
+
lastNonSilence: z.ZodOptional<z.ZodNumber>;
|
|
240
264
|
startTimestamp: z.ZodOptional<z.ZodNumber>;
|
|
241
265
|
endTimestamp: z.ZodOptional<z.ZodNumber>;
|
|
242
266
|
receivedAtMs: z.ZodOptional<z.ZodNumber>;
|
|
243
267
|
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
268
|
+
rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
244
269
|
}, "strip", z.ZodTypeAny, {
|
|
245
270
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
246
271
|
audioUtteranceId: string;
|
|
247
272
|
finalTranscript: string;
|
|
273
|
+
finalTranscriptRaw: string;
|
|
248
274
|
is_finished: boolean;
|
|
249
275
|
finalTranscriptConfidence?: number | undefined;
|
|
250
276
|
pendingTranscript?: string | undefined;
|
|
277
|
+
pendingTranscriptRaw?: string | undefined;
|
|
251
278
|
pendingTranscriptConfidence?: number | undefined;
|
|
252
279
|
voiceStart?: number | undefined;
|
|
253
280
|
voiceDuration?: number | undefined;
|
|
254
281
|
voiceEnd?: number | undefined;
|
|
282
|
+
lastNonSilence?: number | undefined;
|
|
255
283
|
startTimestamp?: number | undefined;
|
|
256
284
|
endTimestamp?: number | undefined;
|
|
257
285
|
receivedAtMs?: number | undefined;
|
|
258
286
|
accumulatedAudioTimeMs?: number | undefined;
|
|
287
|
+
rawAudioTimeMs?: number | undefined;
|
|
259
288
|
}, {
|
|
260
289
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
261
290
|
audioUtteranceId: string;
|
|
262
291
|
finalTranscript: string;
|
|
292
|
+
finalTranscriptRaw: string;
|
|
263
293
|
is_finished: boolean;
|
|
264
294
|
finalTranscriptConfidence?: number | undefined;
|
|
265
295
|
pendingTranscript?: string | undefined;
|
|
296
|
+
pendingTranscriptRaw?: string | undefined;
|
|
266
297
|
pendingTranscriptConfidence?: number | undefined;
|
|
267
298
|
voiceStart?: number | undefined;
|
|
268
299
|
voiceDuration?: number | undefined;
|
|
269
300
|
voiceEnd?: number | undefined;
|
|
301
|
+
lastNonSilence?: number | undefined;
|
|
270
302
|
startTimestamp?: number | undefined;
|
|
271
303
|
endTimestamp?: number | undefined;
|
|
272
304
|
receivedAtMs?: number | undefined;
|
|
273
305
|
accumulatedAudioTimeMs?: number | undefined;
|
|
306
|
+
rawAudioTimeMs?: number | undefined;
|
|
274
307
|
}>;
|
|
275
308
|
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
|
|
276
309
|
/**
|
|
@@ -300,11 +333,22 @@ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
|
|
|
300
333
|
* - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
|
|
301
334
|
* - EMPTY → recog.client.websocket.transcript.final_empty
|
|
302
335
|
* - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
|
|
336
|
+
* - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
|
|
303
337
|
*/
|
|
304
338
|
declare enum TranscriptOutcomeType {
|
|
305
339
|
WITH_CONTENT = "with_content",
|
|
306
340
|
EMPTY = "empty",
|
|
307
|
-
NEVER_SENT = "never_sent"
|
|
341
|
+
NEVER_SENT = "never_sent",
|
|
342
|
+
ERROR_AUTHENTICATION = "error_authentication",
|
|
343
|
+
ERROR_VALIDATION = "error_validation",
|
|
344
|
+
ERROR_PROVIDER = "error_provider",
|
|
345
|
+
ERROR_TIMEOUT = "error_timeout",
|
|
346
|
+
ERROR_QUOTA = "error_quota",
|
|
347
|
+
ERROR_INTERNAL_QUOTA = "error_internal_quota",
|
|
348
|
+
ERROR_CONNECTION = "error_connection",
|
|
349
|
+
ERROR_NO_AUDIO = "error_no_audio",
|
|
350
|
+
ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
|
|
351
|
+
ERROR_UNKNOWN = "error_unknown"
|
|
308
352
|
}
|
|
309
353
|
/**
|
|
310
354
|
* Metadata result V1 - contains metadata, timing information, and ASR config
|
|
@@ -314,6 +358,7 @@ declare enum TranscriptOutcomeType {
|
|
|
314
358
|
declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
315
359
|
type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
|
|
316
360
|
audioUtteranceId: z.ZodString;
|
|
361
|
+
connectionInitiatedAtMs: z.ZodOptional<z.ZodNumber>;
|
|
317
362
|
recordingStartMs: z.ZodOptional<z.ZodNumber>;
|
|
318
363
|
recordingEndMs: z.ZodOptional<z.ZodNumber>;
|
|
319
364
|
transcriptEndMs: z.ZodOptional<z.ZodNumber>;
|
|
@@ -321,14 +366,53 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
321
366
|
duration: z.ZodOptional<z.ZodNumber>;
|
|
322
367
|
volume: z.ZodOptional<z.ZodNumber>;
|
|
323
368
|
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
369
|
+
rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
324
370
|
costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
|
|
325
371
|
apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
|
|
326
372
|
asrConfig: z.ZodOptional<z.ZodString>;
|
|
327
373
|
rawAsrMetadata: z.ZodOptional<z.ZodString>;
|
|
328
374
|
transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
|
|
375
|
+
audioMetrics: z.ZodOptional<z.ZodObject<{
|
|
376
|
+
valid: z.ZodBoolean;
|
|
377
|
+
audioBeginMs: z.ZodNumber;
|
|
378
|
+
audioEndMs: z.ZodNumber;
|
|
379
|
+
maxVolume: z.ZodNumber;
|
|
380
|
+
minVolume: z.ZodNumber;
|
|
381
|
+
avgVolume: z.ZodNumber;
|
|
382
|
+
silenceRatio: z.ZodNumber;
|
|
383
|
+
clippingRatio: z.ZodNumber;
|
|
384
|
+
snrEstimate: z.ZodNullable<z.ZodNumber>;
|
|
385
|
+
lastNonSilenceMs: z.ZodNumber;
|
|
386
|
+
timestamp: z.ZodString;
|
|
387
|
+
}, "strip", z.ZodTypeAny, {
|
|
388
|
+
valid: boolean;
|
|
389
|
+
audioBeginMs: number;
|
|
390
|
+
audioEndMs: number;
|
|
391
|
+
maxVolume: number;
|
|
392
|
+
minVolume: number;
|
|
393
|
+
avgVolume: number;
|
|
394
|
+
silenceRatio: number;
|
|
395
|
+
clippingRatio: number;
|
|
396
|
+
snrEstimate: number | null;
|
|
397
|
+
lastNonSilenceMs: number;
|
|
398
|
+
timestamp: string;
|
|
399
|
+
}, {
|
|
400
|
+
valid: boolean;
|
|
401
|
+
audioBeginMs: number;
|
|
402
|
+
audioEndMs: number;
|
|
403
|
+
maxVolume: number;
|
|
404
|
+
minVolume: number;
|
|
405
|
+
avgVolume: number;
|
|
406
|
+
silenceRatio: number;
|
|
407
|
+
clippingRatio: number;
|
|
408
|
+
snrEstimate: number | null;
|
|
409
|
+
lastNonSilenceMs: number;
|
|
410
|
+
timestamp: string;
|
|
411
|
+
}>>;
|
|
329
412
|
}, "strip", z.ZodTypeAny, {
|
|
330
413
|
type: RecognitionResultTypeV1.METADATA;
|
|
331
414
|
audioUtteranceId: string;
|
|
415
|
+
connectionInitiatedAtMs?: number | undefined;
|
|
332
416
|
recordingStartMs?: number | undefined;
|
|
333
417
|
recordingEndMs?: number | undefined;
|
|
334
418
|
transcriptEndMs?: number | undefined;
|
|
@@ -336,14 +420,29 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
336
420
|
duration?: number | undefined;
|
|
337
421
|
volume?: number | undefined;
|
|
338
422
|
accumulatedAudioTimeMs?: number | undefined;
|
|
423
|
+
rawAudioTimeMs?: number | undefined;
|
|
339
424
|
costInUSD?: number | undefined;
|
|
340
425
|
apiType?: ASRApiType | undefined;
|
|
341
426
|
asrConfig?: string | undefined;
|
|
342
427
|
rawAsrMetadata?: string | undefined;
|
|
343
428
|
transcriptOutcome?: TranscriptOutcomeType | undefined;
|
|
429
|
+
audioMetrics?: {
|
|
430
|
+
valid: boolean;
|
|
431
|
+
audioBeginMs: number;
|
|
432
|
+
audioEndMs: number;
|
|
433
|
+
maxVolume: number;
|
|
434
|
+
minVolume: number;
|
|
435
|
+
avgVolume: number;
|
|
436
|
+
silenceRatio: number;
|
|
437
|
+
clippingRatio: number;
|
|
438
|
+
snrEstimate: number | null;
|
|
439
|
+
lastNonSilenceMs: number;
|
|
440
|
+
timestamp: string;
|
|
441
|
+
} | undefined;
|
|
344
442
|
}, {
|
|
345
443
|
type: RecognitionResultTypeV1.METADATA;
|
|
346
444
|
audioUtteranceId: string;
|
|
445
|
+
connectionInitiatedAtMs?: number | undefined;
|
|
347
446
|
recordingStartMs?: number | undefined;
|
|
348
447
|
recordingEndMs?: number | undefined;
|
|
349
448
|
transcriptEndMs?: number | undefined;
|
|
@@ -351,11 +450,25 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
351
450
|
duration?: number | undefined;
|
|
352
451
|
volume?: number | undefined;
|
|
353
452
|
accumulatedAudioTimeMs?: number | undefined;
|
|
453
|
+
rawAudioTimeMs?: number | undefined;
|
|
354
454
|
costInUSD?: number | undefined;
|
|
355
455
|
apiType?: ASRApiType | undefined;
|
|
356
456
|
asrConfig?: string | undefined;
|
|
357
457
|
rawAsrMetadata?: string | undefined;
|
|
358
458
|
transcriptOutcome?: TranscriptOutcomeType | undefined;
|
|
459
|
+
audioMetrics?: {
|
|
460
|
+
valid: boolean;
|
|
461
|
+
audioBeginMs: number;
|
|
462
|
+
audioEndMs: number;
|
|
463
|
+
maxVolume: number;
|
|
464
|
+
minVolume: number;
|
|
465
|
+
avgVolume: number;
|
|
466
|
+
silenceRatio: number;
|
|
467
|
+
clippingRatio: number;
|
|
468
|
+
snrEstimate: number | null;
|
|
469
|
+
lastNonSilenceMs: number;
|
|
470
|
+
timestamp: string;
|
|
471
|
+
} | undefined;
|
|
359
472
|
}>;
|
|
360
473
|
type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
|
|
361
474
|
/**
|
|
@@ -367,7 +480,10 @@ declare enum ErrorTypeV1 {
|
|
|
367
480
|
PROVIDER_ERROR = "provider_error",
|
|
368
481
|
TIMEOUT_ERROR = "timeout_error",
|
|
369
482
|
QUOTA_EXCEEDED = "quota_exceeded",
|
|
483
|
+
INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
|
|
370
484
|
CONNECTION_ERROR = "connection_error",
|
|
485
|
+
NO_AUDIO_ERROR = "no_audio_error",
|
|
486
|
+
CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
|
|
371
487
|
UNKNOWN_ERROR = "unknown_error"
|
|
372
488
|
}
|
|
373
489
|
/**
|
|
@@ -419,6 +535,15 @@ declare enum ControlSignalTypeV1 {
|
|
|
419
535
|
START_RECORDING = "start_recording",
|
|
420
536
|
STOP_RECORDING = "stop_recording"
|
|
421
537
|
}
|
|
538
|
+
/**
|
|
539
|
+
* Prefix audio mode for ASR Request V1
|
|
540
|
+
* Controls how prefix audio is handled during recognition
|
|
541
|
+
*/
|
|
542
|
+
declare enum PrefixMode {
|
|
543
|
+
NONE = "none",
|
|
544
|
+
CLIENT = "client",
|
|
545
|
+
STORED = "stored"
|
|
546
|
+
}
|
|
422
547
|
/**
|
|
423
548
|
* Game context V1 - contains game state information
|
|
424
549
|
*/
|
|
@@ -476,13 +601,13 @@ declare enum FinalTranscriptStability {
|
|
|
476
601
|
*/
|
|
477
602
|
AGGRESSIVE = "aggressive",
|
|
478
603
|
/**
|
|
479
|
-
* Balanced mode:
|
|
604
|
+
* Balanced mode: 500ms timeout (default)
|
|
480
605
|
* Natural middle ground for most conversational scenarios
|
|
481
606
|
* Use cases: General customer support, tech support, typical voice interactions
|
|
482
607
|
*/
|
|
483
608
|
BALANCED = "balanced",
|
|
484
609
|
/**
|
|
485
|
-
* Conservative mode:
|
|
610
|
+
* Conservative mode: 1000ms timeout
|
|
486
611
|
* Wait longer for providers, optimized for complex/reflective speech
|
|
487
612
|
* Use cases: Healthcare, complex queries, careful thought processes
|
|
488
613
|
*/
|
|
@@ -574,13 +699,70 @@ interface ASRRequestConfig {
|
|
|
574
699
|
* doesn't respond with is_final=true after stopRecording().
|
|
575
700
|
*
|
|
576
701
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
577
|
-
* - balanced:
|
|
578
|
-
* - conservative:
|
|
702
|
+
* - balanced: 500ms - current default, good for most cases
|
|
703
|
+
* - conservative: 1000ms - wait longer for complex utterances
|
|
579
704
|
*
|
|
580
705
|
* @default 'balanced'
|
|
581
706
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
582
707
|
*/
|
|
583
708
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
709
|
+
/**
|
|
710
|
+
* Traffic control priority for quota slot allocation
|
|
711
|
+
*
|
|
712
|
+
* Controls which quota slots this request can use when traffic control is enabled.
|
|
713
|
+
* The quota system reserves a portion of slots for high-priority requests.
|
|
714
|
+
*
|
|
715
|
+
* - 'high': Can use all quota slots (reserved for critical games like song-quiz)
|
|
716
|
+
* - 'low': Limited to non-reserved slots (default for most requests)
|
|
717
|
+
*
|
|
718
|
+
* @default 'low'
|
|
719
|
+
*/
|
|
720
|
+
priority?: 'low' | 'high';
|
|
721
|
+
/**
|
|
722
|
+
* Prefix audio injection mode
|
|
723
|
+
*
|
|
724
|
+
* Controls how prefix audio is handled:
|
|
725
|
+
* - 'none': No prefix audio (default)
|
|
726
|
+
* - 'client': Client sends PREFIX_AUDIO before user audio
|
|
727
|
+
* - 'stored': Server injects stored prefix audio by prefixId
|
|
728
|
+
*
|
|
729
|
+
* @default 'none'
|
|
730
|
+
*/
|
|
731
|
+
prefixMode?: PrefixMode | string;
|
|
732
|
+
/**
|
|
733
|
+
* Stored prefix audio identifier
|
|
734
|
+
*
|
|
735
|
+
* Only used when prefixMode='stored'. The server will look up this ID
|
|
736
|
+
* in the PrefixAudioCache and inject the corresponding audio before
|
|
737
|
+
* user audio is processed.
|
|
738
|
+
*
|
|
739
|
+
* @example 'song_quiz'
|
|
740
|
+
*/
|
|
741
|
+
prefixId?: string;
|
|
742
|
+
/**
|
|
743
|
+
* Prefix text patterns to remove from transcripts
|
|
744
|
+
*
|
|
745
|
+
* Array of prefix text variants that should be stripped from the transcript.
|
|
746
|
+
* This is used when prefix audio is injected and the ASR transcribes both
|
|
747
|
+
* the prefix and user speech - we remove the prefix portion.
|
|
748
|
+
*
|
|
749
|
+
* Multiple variants are supported because ASR may transcribe contractions
|
|
750
|
+
* differently (e.g., "What's this song" vs "What is this song").
|
|
751
|
+
*
|
|
752
|
+
* Matching rules:
|
|
753
|
+
* - Case insensitive
|
|
754
|
+
* - Leading/trailing whitespace trimmed
|
|
755
|
+
* - Multiple spaces collapsed
|
|
756
|
+
* - Punctuation (?.!,) stripped for matching
|
|
757
|
+
* - Apostrophes preserved (part of contractions)
|
|
758
|
+
*
|
|
759
|
+
* Can be set via:
|
|
760
|
+
* - Server-side game config (production)
|
|
761
|
+
* - Client-side ASRRequest (testing/override) - takes precedence
|
|
762
|
+
*
|
|
763
|
+
* @example ["What's this song", "What is this song"]
|
|
764
|
+
*/
|
|
765
|
+
prefixTextToRemove?: string[];
|
|
584
766
|
/**
|
|
585
767
|
* Additional provider-specific options
|
|
586
768
|
*
|
|
@@ -1040,6 +1222,26 @@ interface IRecognitionClient {
|
|
|
1040
1222
|
* @returns WebSocket URL string
|
|
1041
1223
|
*/
|
|
1042
1224
|
getUrl(): string;
|
|
1225
|
+
/**
|
|
1226
|
+
* Send game context after connection is established (for preconnect flow).
|
|
1227
|
+
*
|
|
1228
|
+
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
1229
|
+
* WITHOUT gameContext → call connect() → WS opens, ASRRequest sent, server
|
|
1230
|
+
* waits in PENDING_CONTEXT → later call sendGameContext() with slotMap →
|
|
1231
|
+
* server attaches provider and sends READY.
|
|
1232
|
+
*
|
|
1233
|
+
* This enables connecting early (before slotMap is known) and sending
|
|
1234
|
+
* game context later when question data is available.
|
|
1235
|
+
*
|
|
1236
|
+
* @param context - Game context including slotMap for keyword boosting
|
|
1237
|
+
*/
|
|
1238
|
+
sendGameContext(context: GameContextV1): void;
|
|
1239
|
+
/**
|
|
1240
|
+
* Check if server has sent READY signal (provider is connected and ready for audio).
|
|
1241
|
+
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
1242
|
+
* @returns true if server is ready to receive audio
|
|
1243
|
+
*/
|
|
1244
|
+
isServerReady(): boolean;
|
|
1043
1245
|
}
|
|
1044
1246
|
/**
|
|
1045
1247
|
* Client statistics interface
|
|
@@ -1114,8 +1316,11 @@ type TranscriptionResult = TranscriptionResultV1;
|
|
|
1114
1316
|
*/
|
|
1115
1317
|
declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
|
|
1116
1318
|
private static readonly PROTOCOL_VERSION;
|
|
1319
|
+
private static readonly MAX_PREFIX_BUFFER_BYTES;
|
|
1117
1320
|
private config;
|
|
1118
1321
|
private audioBuffer;
|
|
1322
|
+
private prefixBuffer;
|
|
1323
|
+
private prefixBufferBytes;
|
|
1119
1324
|
private messageHandler;
|
|
1120
1325
|
private state;
|
|
1121
1326
|
private connectionPromise;
|
|
@@ -1160,6 +1365,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1160
1365
|
isStopping(): boolean;
|
|
1161
1366
|
isTranscriptionFinished(): boolean;
|
|
1162
1367
|
isBufferOverflowing(): boolean;
|
|
1368
|
+
isServerReady(): boolean;
|
|
1369
|
+
sendGameContext(context: GameContextV1): void;
|
|
1163
1370
|
getStats(): IRecognitionClientStats;
|
|
1164
1371
|
protected onConnected(): void;
|
|
1165
1372
|
protected onDisconnected(code: number, reason: string): void;
|
|
@@ -1183,6 +1390,28 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1183
1390
|
* @param audioData - Audio data to send
|
|
1184
1391
|
*/
|
|
1185
1392
|
private sendAudioNow;
|
|
1393
|
+
/**
|
|
1394
|
+
* Send prefix audio to the server.
|
|
1395
|
+
* Prefix audio is sent before user audio and is used for context/priming.
|
|
1396
|
+
* The server will process it but adjust timing so transcripts reflect user audio timing.
|
|
1397
|
+
*
|
|
1398
|
+
* Note: Prefix audio is buffered until READY state, then flushed before user audio.
|
|
1399
|
+
* This ensures proper ordering even if called before server is ready.
|
|
1400
|
+
*
|
|
1401
|
+
* @param audioData - Prefix audio data (ArrayBuffer, ArrayBufferView, or Blob)
|
|
1402
|
+
*/
|
|
1403
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1404
|
+
/**
|
|
1405
|
+
* Internal method to handle prefix audio with buffering
|
|
1406
|
+
* Buffers if not READY, sends immediately if READY
|
|
1407
|
+
*/
|
|
1408
|
+
private sendPrefixAudioInternal;
|
|
1409
|
+
/**
|
|
1410
|
+
* Send prefix audio immediately to the server (without buffering)
|
|
1411
|
+
* Uses encoding offset to mark as prefix audio
|
|
1412
|
+
* @param audioData - Prefix audio data to send
|
|
1413
|
+
*/
|
|
1414
|
+
private sendPrefixAudioNow;
|
|
1186
1415
|
}
|
|
1187
1416
|
|
|
1188
1417
|
export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };
|