@volley/recognition-client-sdk 0.1.424 → 0.1.621
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +233 -7
- package/dist/index.bundled.d.ts +342 -10
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +242 -10
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +231 -10
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +28 -1
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +20 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +17 -0
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/package.json +7 -7
- package/src/index.ts +2 -0
- package/src/recognition-client.ts +154 -4
- package/src/recognition-client.types.ts +23 -0
- package/src/simplified-vgf-recognition-client.integration.spec.ts +15 -3
- package/src/simplified-vgf-recognition-client.ts +28 -1
- package/src/utils/audio-ring-buffer.spec.ts +335 -0
|
@@ -15,7 +15,11 @@ declare enum RecognitionProvider {
|
|
|
15
15
|
GOOGLE = "google",
|
|
16
16
|
GEMINI_BATCH = "gemini-batch",
|
|
17
17
|
OPENAI_BATCH = "openai-batch",
|
|
18
|
-
OPENAI_REALTIME = "openai-realtime"
|
|
18
|
+
OPENAI_REALTIME = "openai-realtime",
|
|
19
|
+
MISTRAL_VOXTRAL = "mistral-voxtral",
|
|
20
|
+
DASHSCOPE = "dashscope",
|
|
21
|
+
TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
|
|
22
|
+
TEST_ASR_STREAMING = "test-asr-streaming"
|
|
19
23
|
}
|
|
20
24
|
/**
|
|
21
25
|
* ASR API type - distinguishes between streaming and file-based transcription APIs
|
|
@@ -77,14 +81,31 @@ declare enum ElevenLabsModel {
|
|
|
77
81
|
* OpenAI Realtime API transcription models
|
|
78
82
|
* These are the verified `input_audio_transcription.model` values.
|
|
79
83
|
* @see https://platform.openai.com/docs/guides/realtime
|
|
84
|
+
* @see https://platform.openai.com/docs/models/gpt-4o-transcribe
|
|
80
85
|
*/
|
|
81
86
|
declare enum OpenAIRealtimeModel {
|
|
87
|
+
GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
|
|
82
88
|
GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
|
|
83
89
|
}
|
|
90
|
+
/**
|
|
91
|
+
* Mistral Voxtral Realtime transcription models
|
|
92
|
+
* @see https://docs.mistral.ai/models/voxtral-mini-transcribe-realtime-26-02
|
|
93
|
+
*/
|
|
94
|
+
declare enum MistralVoxtralModel {
|
|
95
|
+
VOXTRAL_MINI_REALTIME_2602 = "voxtral-mini-transcribe-realtime-2602"
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* DashScope Qwen-ASR Realtime transcription models
|
|
99
|
+
* @see https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition
|
|
100
|
+
*/
|
|
101
|
+
declare enum DashScopeModel {
|
|
102
|
+
QWEN3_ASR_FLASH_REALTIME_2602 = "qwen3-asr-flash-realtime-2026-02-10",
|
|
103
|
+
QWEN3_ASR_FLASH_REALTIME = "qwen3-asr-flash-realtime"
|
|
104
|
+
}
|
|
84
105
|
/**
|
|
85
106
|
* Type alias for any model from any provider
|
|
86
107
|
*/
|
|
87
|
-
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
|
|
108
|
+
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | DashScopeModel | string;
|
|
88
109
|
|
|
89
110
|
/**
|
|
90
111
|
* Audio encoding types
|
|
@@ -230,8 +251,10 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
230
251
|
type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
|
|
231
252
|
audioUtteranceId: z.ZodString;
|
|
232
253
|
finalTranscript: z.ZodString;
|
|
254
|
+
finalTranscriptRaw: z.ZodString;
|
|
233
255
|
finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
|
|
234
256
|
pendingTranscript: z.ZodOptional<z.ZodString>;
|
|
257
|
+
pendingTranscriptRaw: z.ZodOptional<z.ZodString>;
|
|
235
258
|
pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
|
|
236
259
|
is_finished: z.ZodBoolean;
|
|
237
260
|
voiceStart: z.ZodOptional<z.ZodNumber>;
|
|
@@ -241,13 +264,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
241
264
|
endTimestamp: z.ZodOptional<z.ZodNumber>;
|
|
242
265
|
receivedAtMs: z.ZodOptional<z.ZodNumber>;
|
|
243
266
|
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
267
|
+
rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
244
268
|
}, "strip", z.ZodTypeAny, {
|
|
245
269
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
246
270
|
audioUtteranceId: string;
|
|
247
271
|
finalTranscript: string;
|
|
272
|
+
finalTranscriptRaw: string;
|
|
248
273
|
is_finished: boolean;
|
|
249
274
|
finalTranscriptConfidence?: number | undefined;
|
|
250
275
|
pendingTranscript?: string | undefined;
|
|
276
|
+
pendingTranscriptRaw?: string | undefined;
|
|
251
277
|
pendingTranscriptConfidence?: number | undefined;
|
|
252
278
|
voiceStart?: number | undefined;
|
|
253
279
|
voiceDuration?: number | undefined;
|
|
@@ -256,13 +282,16 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
256
282
|
endTimestamp?: number | undefined;
|
|
257
283
|
receivedAtMs?: number | undefined;
|
|
258
284
|
accumulatedAudioTimeMs?: number | undefined;
|
|
285
|
+
rawAudioTimeMs?: number | undefined;
|
|
259
286
|
}, {
|
|
260
287
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
261
288
|
audioUtteranceId: string;
|
|
262
289
|
finalTranscript: string;
|
|
290
|
+
finalTranscriptRaw: string;
|
|
263
291
|
is_finished: boolean;
|
|
264
292
|
finalTranscriptConfidence?: number | undefined;
|
|
265
293
|
pendingTranscript?: string | undefined;
|
|
294
|
+
pendingTranscriptRaw?: string | undefined;
|
|
266
295
|
pendingTranscriptConfidence?: number | undefined;
|
|
267
296
|
voiceStart?: number | undefined;
|
|
268
297
|
voiceDuration?: number | undefined;
|
|
@@ -271,6 +300,7 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
271
300
|
endTimestamp?: number | undefined;
|
|
272
301
|
receivedAtMs?: number | undefined;
|
|
273
302
|
accumulatedAudioTimeMs?: number | undefined;
|
|
303
|
+
rawAudioTimeMs?: number | undefined;
|
|
274
304
|
}>;
|
|
275
305
|
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
|
|
276
306
|
/**
|
|
@@ -300,11 +330,22 @@ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
|
|
|
300
330
|
* - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
|
|
301
331
|
* - EMPTY → recog.client.websocket.transcript.final_empty
|
|
302
332
|
* - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
|
|
333
|
+
* - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
|
|
303
334
|
*/
|
|
304
335
|
declare enum TranscriptOutcomeType {
|
|
305
336
|
WITH_CONTENT = "with_content",
|
|
306
337
|
EMPTY = "empty",
|
|
307
|
-
NEVER_SENT = "never_sent"
|
|
338
|
+
NEVER_SENT = "never_sent",
|
|
339
|
+
ERROR_AUTHENTICATION = "error_authentication",
|
|
340
|
+
ERROR_VALIDATION = "error_validation",
|
|
341
|
+
ERROR_PROVIDER = "error_provider",
|
|
342
|
+
ERROR_TIMEOUT = "error_timeout",
|
|
343
|
+
ERROR_QUOTA = "error_quota",
|
|
344
|
+
ERROR_INTERNAL_QUOTA = "error_internal_quota",
|
|
345
|
+
ERROR_CONNECTION = "error_connection",
|
|
346
|
+
ERROR_NO_AUDIO = "error_no_audio",
|
|
347
|
+
ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
|
|
348
|
+
ERROR_UNKNOWN = "error_unknown"
|
|
308
349
|
}
|
|
309
350
|
/**
|
|
310
351
|
* Metadata result V1 - contains metadata, timing information, and ASR config
|
|
@@ -314,6 +355,7 @@ declare enum TranscriptOutcomeType {
|
|
|
314
355
|
declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
315
356
|
type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
|
|
316
357
|
audioUtteranceId: z.ZodString;
|
|
358
|
+
connectionInitiatedAtMs: z.ZodOptional<z.ZodNumber>;
|
|
317
359
|
recordingStartMs: z.ZodOptional<z.ZodNumber>;
|
|
318
360
|
recordingEndMs: z.ZodOptional<z.ZodNumber>;
|
|
319
361
|
transcriptEndMs: z.ZodOptional<z.ZodNumber>;
|
|
@@ -321,14 +363,53 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
321
363
|
duration: z.ZodOptional<z.ZodNumber>;
|
|
322
364
|
volume: z.ZodOptional<z.ZodNumber>;
|
|
323
365
|
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
366
|
+
rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
324
367
|
costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
|
|
325
368
|
apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
|
|
326
369
|
asrConfig: z.ZodOptional<z.ZodString>;
|
|
327
370
|
rawAsrMetadata: z.ZodOptional<z.ZodString>;
|
|
328
371
|
transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
|
|
372
|
+
audioMetrics: z.ZodOptional<z.ZodObject<{
|
|
373
|
+
valid: z.ZodBoolean;
|
|
374
|
+
audioBeginMs: z.ZodNumber;
|
|
375
|
+
audioEndMs: z.ZodNumber;
|
|
376
|
+
maxVolume: z.ZodNumber;
|
|
377
|
+
minVolume: z.ZodNumber;
|
|
378
|
+
avgVolume: z.ZodNumber;
|
|
379
|
+
silenceRatio: z.ZodNumber;
|
|
380
|
+
clippingRatio: z.ZodNumber;
|
|
381
|
+
snrEstimate: z.ZodNullable<z.ZodNumber>;
|
|
382
|
+
lastNonSilenceMs: z.ZodNumber;
|
|
383
|
+
timestamp: z.ZodString;
|
|
384
|
+
}, "strip", z.ZodTypeAny, {
|
|
385
|
+
valid: boolean;
|
|
386
|
+
audioBeginMs: number;
|
|
387
|
+
audioEndMs: number;
|
|
388
|
+
maxVolume: number;
|
|
389
|
+
minVolume: number;
|
|
390
|
+
avgVolume: number;
|
|
391
|
+
silenceRatio: number;
|
|
392
|
+
clippingRatio: number;
|
|
393
|
+
snrEstimate: number | null;
|
|
394
|
+
lastNonSilenceMs: number;
|
|
395
|
+
timestamp: string;
|
|
396
|
+
}, {
|
|
397
|
+
valid: boolean;
|
|
398
|
+
audioBeginMs: number;
|
|
399
|
+
audioEndMs: number;
|
|
400
|
+
maxVolume: number;
|
|
401
|
+
minVolume: number;
|
|
402
|
+
avgVolume: number;
|
|
403
|
+
silenceRatio: number;
|
|
404
|
+
clippingRatio: number;
|
|
405
|
+
snrEstimate: number | null;
|
|
406
|
+
lastNonSilenceMs: number;
|
|
407
|
+
timestamp: string;
|
|
408
|
+
}>>;
|
|
329
409
|
}, "strip", z.ZodTypeAny, {
|
|
330
410
|
type: RecognitionResultTypeV1.METADATA;
|
|
331
411
|
audioUtteranceId: string;
|
|
412
|
+
connectionInitiatedAtMs?: number | undefined;
|
|
332
413
|
recordingStartMs?: number | undefined;
|
|
333
414
|
recordingEndMs?: number | undefined;
|
|
334
415
|
transcriptEndMs?: number | undefined;
|
|
@@ -336,14 +417,29 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
336
417
|
duration?: number | undefined;
|
|
337
418
|
volume?: number | undefined;
|
|
338
419
|
accumulatedAudioTimeMs?: number | undefined;
|
|
420
|
+
rawAudioTimeMs?: number | undefined;
|
|
339
421
|
costInUSD?: number | undefined;
|
|
340
422
|
apiType?: ASRApiType | undefined;
|
|
341
423
|
asrConfig?: string | undefined;
|
|
342
424
|
rawAsrMetadata?: string | undefined;
|
|
343
425
|
transcriptOutcome?: TranscriptOutcomeType | undefined;
|
|
426
|
+
audioMetrics?: {
|
|
427
|
+
valid: boolean;
|
|
428
|
+
audioBeginMs: number;
|
|
429
|
+
audioEndMs: number;
|
|
430
|
+
maxVolume: number;
|
|
431
|
+
minVolume: number;
|
|
432
|
+
avgVolume: number;
|
|
433
|
+
silenceRatio: number;
|
|
434
|
+
clippingRatio: number;
|
|
435
|
+
snrEstimate: number | null;
|
|
436
|
+
lastNonSilenceMs: number;
|
|
437
|
+
timestamp: string;
|
|
438
|
+
} | undefined;
|
|
344
439
|
}, {
|
|
345
440
|
type: RecognitionResultTypeV1.METADATA;
|
|
346
441
|
audioUtteranceId: string;
|
|
442
|
+
connectionInitiatedAtMs?: number | undefined;
|
|
347
443
|
recordingStartMs?: number | undefined;
|
|
348
444
|
recordingEndMs?: number | undefined;
|
|
349
445
|
transcriptEndMs?: number | undefined;
|
|
@@ -351,11 +447,25 @@ declare const MetadataResultSchemaV1: z.ZodObject<{
|
|
|
351
447
|
duration?: number | undefined;
|
|
352
448
|
volume?: number | undefined;
|
|
353
449
|
accumulatedAudioTimeMs?: number | undefined;
|
|
450
|
+
rawAudioTimeMs?: number | undefined;
|
|
354
451
|
costInUSD?: number | undefined;
|
|
355
452
|
apiType?: ASRApiType | undefined;
|
|
356
453
|
asrConfig?: string | undefined;
|
|
357
454
|
rawAsrMetadata?: string | undefined;
|
|
358
455
|
transcriptOutcome?: TranscriptOutcomeType | undefined;
|
|
456
|
+
audioMetrics?: {
|
|
457
|
+
valid: boolean;
|
|
458
|
+
audioBeginMs: number;
|
|
459
|
+
audioEndMs: number;
|
|
460
|
+
maxVolume: number;
|
|
461
|
+
minVolume: number;
|
|
462
|
+
avgVolume: number;
|
|
463
|
+
silenceRatio: number;
|
|
464
|
+
clippingRatio: number;
|
|
465
|
+
snrEstimate: number | null;
|
|
466
|
+
lastNonSilenceMs: number;
|
|
467
|
+
timestamp: string;
|
|
468
|
+
} | undefined;
|
|
359
469
|
}>;
|
|
360
470
|
type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
|
|
361
471
|
/**
|
|
@@ -367,7 +477,10 @@ declare enum ErrorTypeV1 {
|
|
|
367
477
|
PROVIDER_ERROR = "provider_error",
|
|
368
478
|
TIMEOUT_ERROR = "timeout_error",
|
|
369
479
|
QUOTA_EXCEEDED = "quota_exceeded",
|
|
480
|
+
INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
|
|
370
481
|
CONNECTION_ERROR = "connection_error",
|
|
482
|
+
NO_AUDIO_ERROR = "no_audio_error",
|
|
483
|
+
CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
|
|
371
484
|
UNKNOWN_ERROR = "unknown_error"
|
|
372
485
|
}
|
|
373
486
|
/**
|
|
@@ -419,6 +532,15 @@ declare enum ControlSignalTypeV1 {
|
|
|
419
532
|
START_RECORDING = "start_recording",
|
|
420
533
|
STOP_RECORDING = "stop_recording"
|
|
421
534
|
}
|
|
535
|
+
/**
|
|
536
|
+
* Prefix audio mode for ASR Request V1
|
|
537
|
+
* Controls how prefix audio is handled during recognition
|
|
538
|
+
*/
|
|
539
|
+
declare enum PrefixMode {
|
|
540
|
+
NONE = "none",
|
|
541
|
+
CLIENT = "client",
|
|
542
|
+
STORED = "stored"
|
|
543
|
+
}
|
|
422
544
|
/**
|
|
423
545
|
* Game context V1 - contains game state information
|
|
424
546
|
*/
|
|
@@ -476,13 +598,13 @@ declare enum FinalTranscriptStability {
|
|
|
476
598
|
*/
|
|
477
599
|
AGGRESSIVE = "aggressive",
|
|
478
600
|
/**
|
|
479
|
-
* Balanced mode:
|
|
601
|
+
* Balanced mode: 500ms timeout (default)
|
|
480
602
|
* Natural middle ground for most conversational scenarios
|
|
481
603
|
* Use cases: General customer support, tech support, typical voice interactions
|
|
482
604
|
*/
|
|
483
605
|
BALANCED = "balanced",
|
|
484
606
|
/**
|
|
485
|
-
* Conservative mode:
|
|
607
|
+
* Conservative mode: 1000ms timeout
|
|
486
608
|
* Wait longer for providers, optimized for complex/reflective speech
|
|
487
609
|
* Use cases: Healthcare, complex queries, careful thought processes
|
|
488
610
|
*/
|
|
@@ -574,13 +696,70 @@ interface ASRRequestConfig {
|
|
|
574
696
|
* doesn't respond with is_final=true after stopRecording().
|
|
575
697
|
*
|
|
576
698
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
577
|
-
* - balanced:
|
|
578
|
-
* - conservative:
|
|
699
|
+
* - balanced: 500ms - current default, good for most cases
|
|
700
|
+
* - conservative: 1000ms - wait longer for complex utterances
|
|
579
701
|
*
|
|
580
702
|
* @default 'balanced'
|
|
581
703
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
582
704
|
*/
|
|
583
705
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
706
|
+
/**
|
|
707
|
+
* Traffic control priority for quota slot allocation
|
|
708
|
+
*
|
|
709
|
+
* Controls which quota slots this request can use when traffic control is enabled.
|
|
710
|
+
* The quota system reserves a portion of slots for high-priority requests.
|
|
711
|
+
*
|
|
712
|
+
* - 'high': Can use all quota slots (reserved for critical games like song-quiz)
|
|
713
|
+
* - 'low': Limited to non-reserved slots (default for most requests)
|
|
714
|
+
*
|
|
715
|
+
* @default 'low'
|
|
716
|
+
*/
|
|
717
|
+
priority?: 'low' | 'high';
|
|
718
|
+
/**
|
|
719
|
+
* Prefix audio injection mode
|
|
720
|
+
*
|
|
721
|
+
* Controls how prefix audio is handled:
|
|
722
|
+
* - 'none': No prefix audio (default)
|
|
723
|
+
* - 'client': Client sends PREFIX_AUDIO before user audio
|
|
724
|
+
* - 'stored': Server injects stored prefix audio by prefixId
|
|
725
|
+
*
|
|
726
|
+
* @default 'none'
|
|
727
|
+
*/
|
|
728
|
+
prefixMode?: PrefixMode | string;
|
|
729
|
+
/**
|
|
730
|
+
* Stored prefix audio identifier
|
|
731
|
+
*
|
|
732
|
+
* Only used when prefixMode='stored'. The server will look up this ID
|
|
733
|
+
* in the PrefixAudioCache and inject the corresponding audio before
|
|
734
|
+
* user audio is processed.
|
|
735
|
+
*
|
|
736
|
+
* @example 'song_quiz'
|
|
737
|
+
*/
|
|
738
|
+
prefixId?: string;
|
|
739
|
+
/**
|
|
740
|
+
* Prefix text patterns to remove from transcripts
|
|
741
|
+
*
|
|
742
|
+
* Array of prefix text variants that should be stripped from the transcript.
|
|
743
|
+
* This is used when prefix audio is injected and the ASR transcribes both
|
|
744
|
+
* the prefix and user speech - we remove the prefix portion.
|
|
745
|
+
*
|
|
746
|
+
* Multiple variants are supported because ASR may transcribe contractions
|
|
747
|
+
* differently (e.g., "What's this song" vs "What is this song").
|
|
748
|
+
*
|
|
749
|
+
* Matching rules:
|
|
750
|
+
* - Case insensitive
|
|
751
|
+
* - Leading/trailing whitespace trimmed
|
|
752
|
+
* - Multiple spaces collapsed
|
|
753
|
+
* - Punctuation (?.!,) stripped for matching
|
|
754
|
+
* - Apostrophes preserved (part of contractions)
|
|
755
|
+
*
|
|
756
|
+
* Can be set via:
|
|
757
|
+
* - Server-side game config (production)
|
|
758
|
+
* - Client-side ASRRequest (testing/override) - takes precedence
|
|
759
|
+
*
|
|
760
|
+
* @example ["What's this song", "What is this song"]
|
|
761
|
+
*/
|
|
762
|
+
prefixTextToRemove?: string[];
|
|
584
763
|
/**
|
|
585
764
|
* Additional provider-specific options
|
|
586
765
|
*
|
|
@@ -1040,6 +1219,26 @@ interface IRecognitionClient {
|
|
|
1040
1219
|
* @returns WebSocket URL string
|
|
1041
1220
|
*/
|
|
1042
1221
|
getUrl(): string;
|
|
1222
|
+
/**
|
|
1223
|
+
* Send game context after connection is established (for preconnect flow).
|
|
1224
|
+
*
|
|
1225
|
+
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
1226
|
+
* WITHOUT gameContext → call connect() → WS opens, ASRRequest sent, server
|
|
1227
|
+
* waits in PENDING_CONTEXT → later call sendGameContext() with slotMap →
|
|
1228
|
+
* server attaches provider and sends READY.
|
|
1229
|
+
*
|
|
1230
|
+
* This enables connecting early (before slotMap is known) and sending
|
|
1231
|
+
* game context later when question data is available.
|
|
1232
|
+
*
|
|
1233
|
+
* @param context - Game context including slotMap for keyword boosting
|
|
1234
|
+
*/
|
|
1235
|
+
sendGameContext(context: GameContextV1): void;
|
|
1236
|
+
/**
|
|
1237
|
+
* Check if server has sent READY signal (provider is connected and ready for audio).
|
|
1238
|
+
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
1239
|
+
* @returns true if server is ready to receive audio
|
|
1240
|
+
*/
|
|
1241
|
+
isServerReady(): boolean;
|
|
1043
1242
|
}
|
|
1044
1243
|
/**
|
|
1045
1244
|
* Client statistics interface
|
|
@@ -1114,8 +1313,11 @@ type TranscriptionResult = TranscriptionResultV1;
|
|
|
1114
1313
|
*/
|
|
1115
1314
|
declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
|
|
1116
1315
|
private static readonly PROTOCOL_VERSION;
|
|
1316
|
+
private static readonly MAX_PREFIX_BUFFER_BYTES;
|
|
1117
1317
|
private config;
|
|
1118
1318
|
private audioBuffer;
|
|
1319
|
+
private prefixBuffer;
|
|
1320
|
+
private prefixBufferBytes;
|
|
1119
1321
|
private messageHandler;
|
|
1120
1322
|
private state;
|
|
1121
1323
|
private connectionPromise;
|
|
@@ -1160,6 +1362,8 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1160
1362
|
isStopping(): boolean;
|
|
1161
1363
|
isTranscriptionFinished(): boolean;
|
|
1162
1364
|
isBufferOverflowing(): boolean;
|
|
1365
|
+
isServerReady(): boolean;
|
|
1366
|
+
sendGameContext(context: GameContextV1): void;
|
|
1163
1367
|
getStats(): IRecognitionClientStats;
|
|
1164
1368
|
protected onConnected(): void;
|
|
1165
1369
|
protected onDisconnected(code: number, reason: string): void;
|
|
@@ -1183,6 +1387,28 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1183
1387
|
* @param audioData - Audio data to send
|
|
1184
1388
|
*/
|
|
1185
1389
|
private sendAudioNow;
|
|
1390
|
+
/**
|
|
1391
|
+
* Send prefix audio to the server.
|
|
1392
|
+
* Prefix audio is sent before user audio and is used for context/priming.
|
|
1393
|
+
* The server will process it but adjust timing so transcripts reflect user audio timing.
|
|
1394
|
+
*
|
|
1395
|
+
* Note: Prefix audio is buffered until READY state, then flushed before user audio.
|
|
1396
|
+
* This ensures proper ordering even if called before server is ready.
|
|
1397
|
+
*
|
|
1398
|
+
* @param audioData - Prefix audio data (ArrayBuffer, ArrayBufferView, or Blob)
|
|
1399
|
+
*/
|
|
1400
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1401
|
+
/**
|
|
1402
|
+
* Internal method to handle prefix audio with buffering
|
|
1403
|
+
* Buffers if not READY, sends immediately if READY
|
|
1404
|
+
*/
|
|
1405
|
+
private sendPrefixAudioInternal;
|
|
1406
|
+
/**
|
|
1407
|
+
* Send prefix audio immediately to the server (without buffering)
|
|
1408
|
+
* Uses encoding offset to mark as prefix audio
|
|
1409
|
+
* @param audioData - Prefix audio data to send
|
|
1410
|
+
*/
|
|
1411
|
+
private sendPrefixAudioNow;
|
|
1186
1412
|
}
|
|
1187
1413
|
|
|
1188
1414
|
export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };
|