@volley/recognition-client-sdk 0.1.200

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1039 @@
1
+ import { z } from 'zod';
2
+
3
+ /**
4
+ * Provider types and enums for recognition services
5
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
6
+ */
7
+ /**
8
+ * Supported speech recognition providers
9
+ */
10
+ declare enum RecognitionProvider {
11
+ ASSEMBLYAI = "assemblyai",
12
+ DEEPGRAM = "deepgram",
13
+ GOOGLE = "google",
14
+ GEMINI_BATCH = "gemini-batch",
15
+ OPENAI_BATCH = "openai-batch"
16
+ }
17
+ /**
18
+ * Deepgram model names
19
+ */
20
+ declare enum DeepgramModel {
21
+ NOVA_2 = "nova-2",
22
+ NOVA_3 = "nova-3",
23
+ FLUX_GENERAL_EN = "flux-general-en"
24
+ }
25
+ /**
26
+ * Google Cloud Speech models
27
+ * @see https://cloud.google.com/speech-to-text/docs/transcription-model
28
+ */
29
+ declare enum GoogleModel {
30
+ LATEST_LONG = "latest_long",
31
+ LATEST_SHORT = "latest_short",
32
+ TELEPHONY = "telephony",
33
+ TELEPHONY_SHORT = "telephony_short",
34
+ MEDICAL_DICTATION = "medical_dictation",
35
+ MEDICAL_CONVERSATION = "medical_conversation",
36
+ DEFAULT = "default",
37
+ COMMAND_AND_SEARCH = "command_and_search",
38
+ PHONE_CALL = "phone_call",
39
+ VIDEO = "video"
40
+ }
41
+ /**
42
+ * Type alias for any model from any provider
43
+ */
44
+ type RecognitionModel = DeepgramModel | GoogleModel | string;
45
+
46
+ /**
47
+ * Audio encoding types
48
+ */
49
+ declare enum AudioEncoding {
50
+ ENCODING_UNSPECIFIED = 0,
51
+ LINEAR16 = 1,
52
+ OGG_OPUS = 2,
53
+ FLAC = 3,
54
+ MULAW = 4,
55
+ ALAW = 5
56
+ }
57
+ declare namespace AudioEncoding {
58
+ /**
59
+ * Convert numeric ID to AudioEncoding enum
60
+ * @param id - Numeric encoding identifier (0-5)
61
+ * @returns AudioEncoding enum value or undefined if invalid
62
+ */
63
+ function fromId(id: number): AudioEncoding | undefined;
64
+ /**
65
+ * Convert string name to AudioEncoding enum
66
+ * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
67
+ * @returns AudioEncoding enum value or undefined if invalid
68
+ */
69
+ function fromName(nameStr: string): AudioEncoding | undefined;
70
+ /**
71
+ * Convert AudioEncoding enum to numeric ID
72
+ * @param encoding - AudioEncoding enum value
73
+ * @returns Numeric ID (0-5)
74
+ */
75
+ function toId(encoding: AudioEncoding): number;
76
+ /**
77
+ * Convert AudioEncoding enum to string name
78
+ * @param encoding - AudioEncoding enum value
79
+ * @returns String name like "LINEAR16", "MULAW", etc.
80
+ */
81
+ function toName(encoding: AudioEncoding): string;
82
+ /**
83
+ * Check if a numeric ID is a valid encoding
84
+ * @param id - Numeric identifier to validate
85
+ * @returns true if valid encoding ID
86
+ */
87
+ function isIdValid(id: number): boolean;
88
+ /**
89
+ * Check if a string name is a valid encoding
90
+ * @param nameStr - String name to validate
91
+ * @returns true if valid encoding name
92
+ */
93
+ function isNameValid(nameStr: string): boolean;
94
+ }
95
+ /**
96
+ * Common sample rates (in Hz)
97
+ */
98
+ declare enum SampleRate {
99
+ RATE_8000 = 8000,
100
+ RATE_16000 = 16000,
101
+ RATE_22050 = 22050,
102
+ RATE_24000 = 24000,
103
+ RATE_32000 = 32000,
104
+ RATE_44100 = 44100,
105
+ RATE_48000 = 48000
106
+ }
107
+ declare namespace SampleRate {
108
+ /**
109
+ * Convert Hz value to SampleRate enum
110
+ * @param hz - Sample rate in Hz (8000, 16000, etc.)
111
+ * @returns SampleRate enum value or undefined if invalid
112
+ */
113
+ function fromHz(hz: number): SampleRate | undefined;
114
+ /**
115
+ * Convert string name to SampleRate enum
116
+ * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
117
+ * @returns SampleRate enum value or undefined if invalid
118
+ */
119
+ function fromName(nameStr: string): SampleRate | undefined;
120
+ /**
121
+ * Convert SampleRate enum to Hz value
122
+ * @param rate - SampleRate enum value
123
+ * @returns Hz value (8000, 16000, etc.)
124
+ */
125
+ function toHz(rate: SampleRate): number;
126
+ /**
127
+ * Convert SampleRate enum to string name
128
+ * @param rate - SampleRate enum value
129
+ * @returns String name like "RATE_8000", "RATE_16000", etc.
130
+ */
131
+ function toName(rate: SampleRate): string;
132
+ /**
133
+ * Check if a numeric Hz value is a valid sample rate
134
+ * @param hz - Hz value to validate
135
+ * @returns true if valid sample rate
136
+ */
137
+ function isHzValid(hz: number): boolean;
138
+ /**
139
+ * Check if a string name is a valid sample rate
140
+ * @param nameStr - String name to validate
141
+ * @returns true if valid sample rate name
142
+ */
143
+ function isNameValid(nameStr: string): boolean;
144
+ }
145
+ /**
146
+ * Supported languages for recognition
147
+ * Using BCP-47 language tags
148
+ */
149
+ declare enum Language {
150
+ ENGLISH_US = "en-US",
151
+ ENGLISH_GB = "en-GB",
152
+ SPANISH_ES = "es-ES",
153
+ SPANISH_MX = "es-MX",
154
+ FRENCH_FR = "fr-FR",
155
+ GERMAN_DE = "de-DE",
156
+ ITALIAN_IT = "it-IT",
157
+ PORTUGUESE_BR = "pt-BR",
158
+ JAPANESE_JP = "ja-JP",
159
+ KOREAN_KR = "ko-KR",
160
+ CHINESE_CN = "zh-CN",
161
+ CHINESE_TW = "zh-TW"
162
+ }
163
+
164
+ /**
165
+ * Recognition Result Types V1
166
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
167
+ * Types and schemas for recognition results sent to SDK clients
168
+ */
169
+
170
+ /**
171
+ * Message type discriminator for recognition results V1
172
+ */
173
+ declare enum RecognitionResultTypeV1 {
174
+ TRANSCRIPTION = "Transcription",// Transcript message contains all in the history. result of STT(Speech to text)
175
+ FUNCTION_CALL = "FunctionCall",// Not supported in P1.result of STF(Speedch to function call) Function call schema
176
+ METADATA = "Metadata",// Metadata message contains all the timestamps, provider info, and ASR config
177
+ ERROR = "Error",// Error message contains the error details
178
+ CLIENT_CONTROL_MESSAGE = "ClientControlMessage"
179
+ }
180
+ /**
181
+ * Transcription result V1 - contains transcript message
182
+ * In the long run game side should not need to know it. In the short run it is send back to client.
183
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
184
+ */
185
+ declare const TranscriptionResultSchemaV1: z.ZodObject<{
186
+ type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
187
+ audioUtteranceId: z.ZodString;
188
+ finalTranscript: z.ZodString;
189
+ finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
190
+ pendingTranscript: z.ZodOptional<z.ZodString>;
191
+ pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
192
+ is_finished: z.ZodBoolean;
193
+ voiceStart: z.ZodOptional<z.ZodNumber>;
194
+ voiceDuration: z.ZodOptional<z.ZodNumber>;
195
+ voiceEnd: z.ZodOptional<z.ZodNumber>;
196
+ startTimestamp: z.ZodOptional<z.ZodNumber>;
197
+ endTimestamp: z.ZodOptional<z.ZodNumber>;
198
+ receivedAtMs: z.ZodOptional<z.ZodNumber>;
199
+ accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
200
+ }, "strip", z.ZodTypeAny, {
201
+ type: RecognitionResultTypeV1.TRANSCRIPTION;
202
+ audioUtteranceId: string;
203
+ finalTranscript: string;
204
+ is_finished: boolean;
205
+ finalTranscriptConfidence?: number | undefined;
206
+ pendingTranscript?: string | undefined;
207
+ pendingTranscriptConfidence?: number | undefined;
208
+ voiceStart?: number | undefined;
209
+ voiceDuration?: number | undefined;
210
+ voiceEnd?: number | undefined;
211
+ startTimestamp?: number | undefined;
212
+ endTimestamp?: number | undefined;
213
+ receivedAtMs?: number | undefined;
214
+ accumulatedAudioTimeMs?: number | undefined;
215
+ }, {
216
+ type: RecognitionResultTypeV1.TRANSCRIPTION;
217
+ audioUtteranceId: string;
218
+ finalTranscript: string;
219
+ is_finished: boolean;
220
+ finalTranscriptConfidence?: number | undefined;
221
+ pendingTranscript?: string | undefined;
222
+ pendingTranscriptConfidence?: number | undefined;
223
+ voiceStart?: number | undefined;
224
+ voiceDuration?: number | undefined;
225
+ voiceEnd?: number | undefined;
226
+ startTimestamp?: number | undefined;
227
+ endTimestamp?: number | undefined;
228
+ receivedAtMs?: number | undefined;
229
+ accumulatedAudioTimeMs?: number | undefined;
230
+ }>;
231
+ type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
232
+ /**
233
+ * Function call result V1 - similar to LLM function call
234
+ * In the long run game server should know it, rather than TV or client.
235
+ */
236
+ declare const FunctionCallResultSchemaV1: z.ZodObject<{
237
+ type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>;
238
+ audioUtteranceId: z.ZodString;
239
+ functionName: z.ZodString;
240
+ functionArgJson: z.ZodString;
241
+ }, "strip", z.ZodTypeAny, {
242
+ type: RecognitionResultTypeV1.FUNCTION_CALL;
243
+ audioUtteranceId: string;
244
+ functionName: string;
245
+ functionArgJson: string;
246
+ }, {
247
+ type: RecognitionResultTypeV1.FUNCTION_CALL;
248
+ audioUtteranceId: string;
249
+ functionName: string;
250
+ functionArgJson: string;
251
+ }>;
252
+ type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
253
+ /**
254
+ * Metadata result V1 - contains metadata, timing information, and ASR config
255
+ * Sent when the provider connection closes to provide final timing metrics and config
256
+ * In the long run game server should know it, rather than TV or client.
257
+ */
258
+ declare const MetadataResultSchemaV1: z.ZodObject<{
259
+ type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
260
+ audioUtteranceId: z.ZodString;
261
+ recordingStartMs: z.ZodOptional<z.ZodNumber>;
262
+ recordingEndMs: z.ZodOptional<z.ZodNumber>;
263
+ transcriptEndMs: z.ZodOptional<z.ZodNumber>;
264
+ socketCloseAtMs: z.ZodOptional<z.ZodNumber>;
265
+ duration: z.ZodOptional<z.ZodNumber>;
266
+ volume: z.ZodOptional<z.ZodNumber>;
267
+ accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
268
+ costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
269
+ asrConfig: z.ZodOptional<z.ZodString>;
270
+ rawAsrMetadata: z.ZodOptional<z.ZodString>;
271
+ }, "strip", z.ZodTypeAny, {
272
+ type: RecognitionResultTypeV1.METADATA;
273
+ audioUtteranceId: string;
274
+ accumulatedAudioTimeMs?: number | undefined;
275
+ recordingStartMs?: number | undefined;
276
+ recordingEndMs?: number | undefined;
277
+ transcriptEndMs?: number | undefined;
278
+ socketCloseAtMs?: number | undefined;
279
+ duration?: number | undefined;
280
+ volume?: number | undefined;
281
+ costInUSD?: number | undefined;
282
+ asrConfig?: string | undefined;
283
+ rawAsrMetadata?: string | undefined;
284
+ }, {
285
+ type: RecognitionResultTypeV1.METADATA;
286
+ audioUtteranceId: string;
287
+ accumulatedAudioTimeMs?: number | undefined;
288
+ recordingStartMs?: number | undefined;
289
+ recordingEndMs?: number | undefined;
290
+ transcriptEndMs?: number | undefined;
291
+ socketCloseAtMs?: number | undefined;
292
+ duration?: number | undefined;
293
+ volume?: number | undefined;
294
+ costInUSD?: number | undefined;
295
+ asrConfig?: string | undefined;
296
+ rawAsrMetadata?: string | undefined;
297
+ }>;
298
+ type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
299
+ /**
300
+ * Error type enum V1 - categorizes different types of errors
301
+ */
302
+ declare enum ErrorTypeV1 {
303
+ AUTHENTICATION_ERROR = "authentication_error",// Authentication/authorization failures
304
+ VALIDATION_ERROR = "validation_error",// Invalid input or configuration
305
+ PROVIDER_ERROR = "provider_error",// Error from ASR provider (Deepgram, Google, etc.) Unlikely to happen with fall
306
+ TIMEOUT_ERROR = "timeout_error",// Request or operation timeout. Likely business logic did not handle timeout.
307
+ QUOTA_EXCEEDED = "quota_exceeded",// Quota or rate limit exceeded. Unlikely to happen with fallbakcs
308
+ UNKNOWN_ERROR = "unknown_error"
309
+ }
310
+ /**
311
+ * Error result V1 - contains error message
312
+ * In the long run game server should know it, rather than TV or client.
313
+ */
314
+ declare const ErrorResultSchemaV1: z.ZodObject<{
315
+ type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>;
316
+ audioUtteranceId: z.ZodString;
317
+ errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>;
318
+ message: z.ZodOptional<z.ZodString>;
319
+ code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
320
+ description: z.ZodOptional<z.ZodString>;
321
+ }, "strip", z.ZodTypeAny, {
322
+ type: RecognitionResultTypeV1.ERROR;
323
+ audioUtteranceId: string;
324
+ code?: string | number | undefined;
325
+ message?: string | undefined;
326
+ errorType?: ErrorTypeV1 | undefined;
327
+ description?: string | undefined;
328
+ }, {
329
+ type: RecognitionResultTypeV1.ERROR;
330
+ audioUtteranceId: string;
331
+ code?: string | number | undefined;
332
+ message?: string | undefined;
333
+ errorType?: ErrorTypeV1 | undefined;
334
+ description?: string | undefined;
335
+ }>;
336
+ type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>;
337
+
338
+ /**
339
+ * Recognition Context Types V1
340
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
341
+ * Types and schemas for recognition context data
342
+ */
343
+
344
+ /**
345
+ * Message type discriminator for recognition context V1
346
+ */
347
+ declare enum RecognitionContextTypeV1 {
348
+ GAME_CONTEXT = "GameContext",
349
+ CONTROL_SIGNAL = "ControlSignal",
350
+ ASR_REQUEST = "ASRRequest"
351
+ }
352
+ /**
353
+ * Control signal types for recognition V1
354
+ */
355
+ declare enum ControlSignalTypeV1 {
356
+ START_RECORDING = "start_recording",
357
+ STOP_RECORDING = "stop_recording"
358
+ }
359
+ /**
360
+ * Game context V1 - contains game state information
361
+ */
362
+ declare const GameContextSchemaV1: z.ZodObject<{
363
+ type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>;
364
+ gameId: z.ZodString;
365
+ gamePhase: z.ZodString;
366
+ promptSTT: z.ZodOptional<z.ZodString>;
367
+ promptSTF: z.ZodOptional<z.ZodString>;
368
+ promptTTF: z.ZodOptional<z.ZodString>;
369
+ slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
370
+ }, "strip", z.ZodTypeAny, {
371
+ type: RecognitionContextTypeV1.GAME_CONTEXT;
372
+ gameId: string;
373
+ gamePhase: string;
374
+ promptSTT?: string | undefined;
375
+ promptSTF?: string | undefined;
376
+ promptTTF?: string | undefined;
377
+ slotMap?: Record<string, string[]> | undefined;
378
+ }, {
379
+ type: RecognitionContextTypeV1.GAME_CONTEXT;
380
+ gameId: string;
381
+ gamePhase: string;
382
+ promptSTT?: string | undefined;
383
+ promptSTF?: string | undefined;
384
+ promptTTF?: string | undefined;
385
+ slotMap?: Record<string, string[]> | undefined;
386
+ }>;
387
+ type GameContextV1 = z.infer<typeof GameContextSchemaV1>;
388
+ /**
389
+ * ASR Request V1 - contains complete ASR setup information
390
+ * Sent once at connection start to configure the session
391
+ */
392
+ declare const ASRRequestSchemaV1: z.ZodObject<{
393
+ type: z.ZodLiteral<RecognitionContextTypeV1.ASR_REQUEST>;
394
+ audioUtteranceId: z.ZodOptional<z.ZodString>;
395
+ provider: z.ZodString;
396
+ model: z.ZodOptional<z.ZodString>;
397
+ language: z.ZodString;
398
+ sampleRate: z.ZodNumber;
399
+ encoding: z.ZodNumber;
400
+ interimResults: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
401
+ useContext: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
402
+ debugCommand: z.ZodOptional<z.ZodObject<{
403
+ enableDebugLog: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
404
+ enableAudioStorage: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
405
+ enableSongQuizSessionIdCheck: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
406
+ enablePilotModels: z.ZodDefault<z.ZodOptional<z.ZodBoolean>>;
407
+ }, "strip", z.ZodTypeAny, {
408
+ enableDebugLog: boolean;
409
+ enableAudioStorage: boolean;
410
+ enableSongQuizSessionIdCheck: boolean;
411
+ enablePilotModels: boolean;
412
+ }, {
413
+ enableDebugLog?: boolean | undefined;
414
+ enableAudioStorage?: boolean | undefined;
415
+ enableSongQuizSessionIdCheck?: boolean | undefined;
416
+ enablePilotModels?: boolean | undefined;
417
+ }>>;
418
+ }, "strip", z.ZodTypeAny, {
419
+ provider: string;
420
+ language: string;
421
+ sampleRate: number;
422
+ encoding: number;
423
+ interimResults: boolean;
424
+ useContext: boolean;
425
+ type: RecognitionContextTypeV1.ASR_REQUEST;
426
+ model?: string | undefined;
427
+ audioUtteranceId?: string | undefined;
428
+ debugCommand?: {
429
+ enableDebugLog: boolean;
430
+ enableAudioStorage: boolean;
431
+ enableSongQuizSessionIdCheck: boolean;
432
+ enablePilotModels: boolean;
433
+ } | undefined;
434
+ }, {
435
+ provider: string;
436
+ language: string;
437
+ sampleRate: number;
438
+ encoding: number;
439
+ type: RecognitionContextTypeV1.ASR_REQUEST;
440
+ model?: string | undefined;
441
+ interimResults?: boolean | undefined;
442
+ useContext?: boolean | undefined;
443
+ audioUtteranceId?: string | undefined;
444
+ debugCommand?: {
445
+ enableDebugLog?: boolean | undefined;
446
+ enableAudioStorage?: boolean | undefined;
447
+ enableSongQuizSessionIdCheck?: boolean | undefined;
448
+ enablePilotModels?: boolean | undefined;
449
+ } | undefined;
450
+ }>;
451
+ type ASRRequestV1 = z.infer<typeof ASRRequestSchemaV1>;
452
+
453
+ /**
454
+ * Unified ASR Request Configuration
455
+ *
456
+ * Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests.
457
+ * This interface provides a consistent API for clients regardless of the underlying provider.
458
+ *
459
+ * All fields use library-defined enums for type safety and consistency.
460
+ * Provider-specific mappers will convert these to provider-native formats.
461
+ */
462
+
463
+ /**
464
+ * Unified ASR request configuration
465
+ *
466
+ * This configuration is used by:
467
+ * - Client SDKs to specify recognition parameters
468
+ * - Demo applications for user input
469
+ * - Service layer to configure provider sessions
470
+ *
471
+ * Core fields only - all provider-specific options go in providerOptions
472
+ *
473
+ * @example
474
+ * ```typescript
475
+ * const config: ASRRequestConfig = {
476
+ * provider: RecognitionProvider.GOOGLE,
477
+ * model: GoogleModel.LATEST_LONG,
478
+ * language: Language.ENGLISH_US,
479
+ * sampleRate: SampleRate.RATE_16000, // or just 16000
480
+ * encoding: AudioEncoding.LINEAR16,
481
+ * providerOptions: {
482
+ * google: {
483
+ * enableAutomaticPunctuation: true,
484
+ * interimResults: true,
485
+ * singleUtterance: false
486
+ * }
487
+ * }
488
+ * };
489
+ * ```
490
+ */
491
+ interface ASRRequestConfig {
492
+ /**
493
+ * The ASR provider to use
494
+ * Must be one of the supported providers in RecognitionProvider enum
495
+ */
496
+ provider: RecognitionProvider | string;
497
+ /**
498
+ * Optional model specification for the provider
499
+ * Can be provider-specific model enum or string
500
+ * If not specified, provider's default model will be used
501
+ */
502
+ model?: RecognitionModel;
503
+ /**
504
+ * Language/locale for recognition
505
+ * Use Language enum for common languages
506
+ * Can also accept BCP-47 language tags as strings
507
+ */
508
+ language: Language | string;
509
+ /**
510
+ * Audio sample rate in Hz
511
+ * Prefer using SampleRate enum values for standard rates
512
+ * Can also accept numeric Hz values (e.g., 16000)
513
+ */
514
+ sampleRate: SampleRate | number;
515
+ /**
516
+ * Audio encoding format
517
+ * Must match the actual audio data being sent
518
+ * Use AudioEncoding enum for standard formats
519
+ */
520
+ encoding: AudioEncoding | string;
521
+ /**
522
+ * Enable interim (partial) results during recognition
523
+ * When true, receive real-time updates before finalization
524
+ * When false, only receive final results
525
+ * Default: false
526
+ */
527
+ interimResults?: boolean;
528
+ /**
529
+ * Require GameContext before starting recognition such as song titles
530
+ * When true, server waits for GameContext message before processing audio
531
+ * When false, recognition starts immediately
532
+ * Default: false
533
+ */
534
+ useContext?: boolean;
535
+ /**
536
+ * Additional provider-specific options
537
+ *
538
+ * Common options per provider:
539
+ * - Deepgram: punctuate, smart_format, diarize, utterances
540
+ * - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets
541
+ * - AssemblyAI: formatTurns, filter_profanity, word_boost
542
+ *
543
+ * Note: interimResults is now a top-level field, but can still be overridden per provider
544
+ *
545
+ * @example
546
+ * ```typescript
547
+ * providerOptions: {
548
+ * google: {
549
+ * enableAutomaticPunctuation: true,
550
+ * singleUtterance: false,
551
+ * enableWordTimeOffsets: false
552
+ * }
553
+ * }
554
+ * ```
555
+ */
556
+ providerOptions?: Record<string, any>;
557
+ /**
558
+ * Optional fallback ASR configurations
559
+ *
560
+ * List of alternative ASR configurations to use if the primary fails.
561
+ * Each fallback config is a complete ASRRequestConfig that will be tried
562
+ * in order until one succeeds.
563
+ *
564
+ * @example
565
+ * ```typescript
566
+ * fallbackModels: [
567
+ * {
568
+ * provider: RecognitionProvider.DEEPGRAM,
569
+ * model: DeepgramModel.NOVA_2,
570
+ * language: Language.ENGLISH_US,
571
+ * sampleRate: 16000,
572
+ * encoding: AudioEncoding.LINEAR16
573
+ * },
574
+ * {
575
+ * provider: RecognitionProvider.GOOGLE,
576
+ * model: GoogleModel.LATEST_SHORT,
577
+ * language: Language.ENGLISH_US,
578
+ * sampleRate: 16000,
579
+ * encoding: AudioEncoding.LINEAR16
580
+ * }
581
+ * ]
582
+ * ```
583
+ */
584
+ fallbackModels?: ASRRequestConfig[];
585
+ }
586
+
587
+ /**
588
+ * Generic WebSocket protocol types and utilities
589
+ * Supports flexible versioning and message types
590
+ * Used by both client and server implementations
591
+ */
592
+
593
+ /**
594
+ * Base message structure - completely flexible
595
+ * @template V - Version type (number, string, etc.)
596
+ */
597
+ interface Message<V = number> {
598
+ v: V;
599
+ type: string;
600
+ data?: unknown;
601
+ }
602
+ /**
603
+ * Version serializer interface
604
+ * Converts between version type V and byte representation
605
+ */
606
+ interface VersionSerializer<V> {
607
+ serialize: (v: V) => number;
608
+ deserialize: (byte: number) => V;
609
+ }
610
+
611
+ /**
612
+ * WebSocketAudioClient - Abstract base class for WebSocket clients
613
+ * Sends audio and control messages, receives responses from server
614
+ *
615
+ * Features:
616
+ * - Generic version type support (number, string, etc.)
617
+ * - Type-safe upward/downward message data
618
+ * - Client-side backpressure monitoring
619
+ * - Abstract hooks for application-specific logic
620
+ * - Format-agnostic audio protocol (supports any encoding)
621
+ */
622
+
623
+ type ClientConfig = {
624
+ url: string;
625
+ highWM?: number;
626
+ lowWM?: number;
627
+ };
628
+ /**
629
+ * WebSocketAudioClient - Abstract base class for WebSocket clients
630
+ * that send audio frames and JSON messages
631
+ *
632
+ * @template V - Version type (number, string, object, etc.)
633
+ * @template TUpward - Type of upward message data (Client -> Server)
634
+ * @template TDownward - Type of downward message data (Server -> Client)
635
+ *
636
+ * @example
637
+ * ```typescript
638
+ * class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> {
639
+ * protected onConnected() {
640
+ * console.log('Connected!');
641
+ * }
642
+ *
643
+ * protected onMessage(msg) {
644
+ * console.log('Received:', msg.type, msg.data);
645
+ * }
646
+ *
647
+ * protected onDisconnected(code, reason) {
648
+ * console.log('Disconnected:', code, reason);
649
+ * }
650
+ *
651
+ * protected onError(error) {
652
+ * console.error('Error:', error);
653
+ * }
654
+ * }
655
+ *
656
+ * const client = new MyClient({ url: 'ws://localhost:8080' });
657
+ * client.connect();
658
+ * client.sendMessage(1, 'configure', { language: 'en' });
659
+ * client.sendAudio(audioData);
660
+ * ```
661
+ */
662
+ declare abstract class WebSocketAudioClient<V = number, // Version type (default: number)
663
+ TUpward = unknown, // Upward message data type
664
+ TDownward = unknown> {
665
+ private cfg;
666
+ protected versionSerializer: VersionSerializer<V>;
667
+ private ws;
668
+ private seq;
669
+ private HWM;
670
+ private LWM;
671
+ constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>);
672
+ /**
673
+ * Hook: Called when WebSocket connection is established
674
+ */
675
+ protected abstract onConnected(): void;
676
+ /**
677
+ * Hook: Called when WebSocket connection closes
678
+ * @param code - Close code (see WebSocketCloseCode enum)
679
+ * @param reason - Human-readable close reason
680
+ */
681
+ protected abstract onDisconnected(code: number, reason: string): void;
682
+ /**
683
+ * Hook: Called when WebSocket error occurs
684
+ */
685
+ protected abstract onError(error: Event): void;
686
+ /**
687
+ * Hook: Called when downward message arrives from server
688
+ * Override this to handle messages (optional - default does nothing)
689
+ */
690
+ protected onMessage(msg: Message<V> & {
691
+ data: TDownward;
692
+ }): void;
693
+ connect(): void;
694
+ /**
695
+ * Send JSON message to server
696
+ * @param version - Message version
697
+ * @param type - Message type (developer defined)
698
+ * @param data - Message payload (typed)
699
+ */
700
+ sendMessage(version: V, type: string, data: TUpward): void;
701
+ /**
702
+ * Send audio frame with specified encoding and sample rate
703
+ * @param audioData - Audio data (any format: Int16Array, Uint8Array, ArrayBuffer, etc.)
704
+ * @param version - Audio frame version
705
+ * @param encodingId - Audio encoding ID (0-5, e.g., AudioEncoding.LINEAR16)
706
+ * @param sampleRate - Sample rate in Hz (e.g., 16000)
707
+ */
708
+ sendAudio(audioData: ArrayBuffer | ArrayBufferView, version: V, encodingId: number, sampleRate: number): void;
709
+ /**
710
+ * Get current WebSocket buffer size
711
+ */
712
+ getBufferedAmount(): number;
713
+ /**
714
+ * Check if local buffer is backpressured
715
+ */
716
+ isLocalBackpressured(): boolean;
717
+ /**
718
+ * Check if ready to send audio
719
+ * Verifies: connection open, no local buffer pressure
720
+ */
721
+ canSend(): boolean;
722
+ /**
723
+ * Check if connection is open
724
+ */
725
+ isOpen(): boolean;
726
+ /**
727
+ * Get current connection state
728
+ */
729
+ getReadyState(): number;
730
+ }
731
+
732
+ /**
733
+ * Recognition Client Types
734
+ *
735
+ * Type definitions and interfaces for the recognition client SDK.
736
+ * These interfaces enable dependency injection, testing, and alternative implementations.
737
+ */
738
+
739
+ /**
740
+ * Client connection state enum
741
+ * Represents the various states a recognition client can be in during its lifecycle
742
+ */
743
+ declare enum ClientState {
744
+ /** Initial state, no connection established */
745
+ INITIAL = "initial",
746
+ /** Actively establishing WebSocket connection */
747
+ CONNECTING = "connecting",
748
+ /** WebSocket connected but waiting for server ready signal */
749
+ CONNECTED = "connected",
750
+ /** Server ready, can send audio */
751
+ READY = "ready",
752
+ /** Sent stop signal, waiting for final transcript */
753
+ STOPPING = "stopping",
754
+ /** Connection closed normally after stop */
755
+ STOPPED = "stopped",
756
+ /** Connection failed or lost unexpectedly */
757
+ FAILED = "failed"
758
+ }
759
+ /**
760
+ * Callback URL configuration with message type filtering
761
+ */
762
+ interface RecognitionCallbackUrl {
763
+ /** The callback URL endpoint */
764
+ url: string;
765
+ /** Array of message types to send to this URL. If empty/undefined, all types are sent */
766
+ messageTypes?: Array<string | number>;
767
+ }
768
+ interface IRecognitionClientConfig {
769
+ /**
770
+ * WebSocket endpoint URL (optional - defaults to production)
771
+ *
772
+ * For different stages, use the helper function:
773
+ * ```typescript
774
+ * import { getRecognitionServiceBase } from '@recog/client-sdk-ts';
775
+ * const base = getRecognitionServiceBase('staging'); // or 'dev', 'production'
776
+ * const url = `${base.wsBase}/ws/v1/recognize`;
777
+ * ```
778
+ */
779
+ url?: string;
780
+ /** ASR configuration (provider, model, language, etc.) - optional */
781
+ asrRequestConfig?: ASRRequestConfig;
782
+ /** Game context for improved recognition accuracy */
783
+ gameContext?: GameContextV1;
784
+ /** Audio utterance ID (optional) - if not provided, a UUID v4 will be generated */
785
+ audioUtteranceId?: string;
786
+ /** Callback URLs for server-side notifications with optional message type filtering (optional)
787
+ * Game side only need to use it if another service need to be notified about the transcription results.
788
+ */
789
+ callbackUrls?: RecognitionCallbackUrl[];
790
+ /** User identification (optional) */
791
+ userId?: string;
792
+ /** Game session identification (optional). called 'sessionId' in Platform and most games. */
793
+ gameSessionId?: string;
794
+ /** Device identification (optional) */
795
+ deviceId?: string;
796
+ /** Account identification (optional) */
797
+ accountId?: string;
798
+ /** Question answer identifier for tracking Q&A sessions (optional and tracking purpose only) */
799
+ questionAnswerId?: string;
800
+ /** Platform for audio recording device (optional, e.g., 'ios', 'android', 'web', 'unity') */
801
+ platform?: string;
802
+ /** Callback when transcript is received */
803
+ onTranscript?: (result: TranscriptionResultV1) => void;
804
+ /**
805
+ * Callback when function call is received
806
+ * Note: Not supported in 2025. P2 feature for future speech-to-function-call capability.
807
+ */
808
+ onFunctionCall?: (result: FunctionCallResultV1) => void;
809
+ /** Callback when metadata is received. Only once after transcription is complete.*/
810
+ onMetadata?: (metadata: MetadataResultV1) => void;
811
+ /** Callback when error occurs */
812
+ onError?: (error: ErrorResultV1) => void;
813
+ /** Callback when connected to WebSocket */
814
+ onConnected?: () => void;
815
+ /**
816
+ * Callback when WebSocket disconnects
817
+ * @param code - WebSocket close code (1000 = normal, 1006 = abnormal, etc.)
818
+ * @param reason - Close reason string
819
+ */
820
+ onDisconnected?: (code: number, reason: string) => void;
821
+ /** High water mark for backpressure control (bytes) */
822
+ highWaterMark?: number;
823
+ /** Low water mark for backpressure control (bytes) */
824
+ lowWaterMark?: number;
825
+ /** Maximum buffer duration in seconds (default: 60s) */
826
+ maxBufferDurationSec?: number;
827
+ /** Expected chunks per second for ring buffer sizing (default: 100) */
828
+ chunksPerSecond?: number;
829
+ /**
830
+ * Optional logger function for debugging
831
+ * If not provided, no logging will occur
832
+ * @param level - Log level: 'debug', 'info', 'warn', 'error'
833
+ * @param message - Log message
834
+ * @param data - Optional additional data
835
+ */
836
+ logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void;
837
+ }
838
/**
 * Recognition Client Interface
 *
 * Main interface for real-time speech recognition clients.
 * Provides methods for connection management, audio streaming, and session control.
 * Implemented by RealTimeTwoWayWebSocketRecognitionClient; depend on this
 * interface (rather than the concrete class) for injection and testing.
 */
interface IRecognitionClient {
    /**
     * Connect to the WebSocket endpoint.
     * @returns Promise that resolves when connected
     * @throws Error if connection fails or times out
     */
    connect(): Promise<void>;
    /**
     * Send audio data to the recognition service.
     * Audio is buffered locally and sent when the connection is ready.
     * @param audioData - PCM audio data as ArrayBuffer or typed array view
     */
    sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
    /**
     * Stop recording and wait for the final transcript.
     * The server will close the connection after sending the final transcript.
     * @returns Promise that resolves when the final transcript is received
     */
    stopRecording(): Promise<void>;
    /**
     * Get the audio utterance ID for this session.
     * Available immediately after client construction.
     * @returns UUID v4 string identifying this recognition session
     */
    getAudioUtteranceId(): string;
    /**
     * Get the current state of the client.
     * @returns Current ClientState value
     */
    getState(): ClientState;
    /**
     * Check if the WebSocket connection is open.
     * @returns true if connected and ready to communicate
     */
    isConnected(): boolean;
    /**
     * Check if the client is currently connecting.
     * @returns true if a connection attempt is in progress
     */
    isConnecting(): boolean;
    /**
     * Check if the client is currently stopping.
     * @returns true if stopRecording() is in progress
     */
    isStopping(): boolean;
    /**
     * Check if transcription has finished.
     * @returns true if the transcription is complete
     */
    isTranscriptionFinished(): boolean;
    /**
     * Check if the audio buffer has overflowed.
     * @returns true if the ring buffer has wrapped around
     */
    isBufferOverflowing(): boolean;
    /**
     * Get client statistics.
     * @returns Statistics about audio transmission and buffering
     */
    getStats(): IRecognitionClientStats;
}
905
/**
 * Client statistics interface
 *
 * Counters describing audio transmission and local buffer state,
 * as returned by IRecognitionClient.getStats().
 */
interface IRecognitionClientStats {
    /** Total audio bytes sent to the server. */
    audioBytesSent: number;
    /** Total number of audio chunks sent. */
    audioChunksSent: number;
    /** Total number of audio chunks buffered. */
    audioChunksBuffered: number;
    /** Number of times the ring buffer overflowed. */
    bufferOverflowCount: number;
    /** Current number of chunks in the buffer. */
    currentBufferedChunks: number;
    /** Whether the ring buffer has wrapped (overwritten old data). */
    hasWrapped: boolean;
}
922
/**
 * Configuration for RealTimeTwoWayWebSocketRecognitionClient
 *
 * This extends IRecognitionClientConfig and is the main configuration interface
 * for creating a new RealTimeTwoWayWebSocketRecognitionClient instance.
 * Currently adds no members of its own — it exists as a named extension point
 * so client-specific options can be added later without changing the base interface.
 */
interface RealTimeTwoWayWebSocketRecognitionClientConfig extends IRecognitionClientConfig {
}
929
+
930
+ /**
931
+ * RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition
932
+ *
933
+ * Features:
934
+ * - Ring buffer-based audio storage with fixed memory footprint
935
+ * - Automatic buffering when disconnected, immediate send when connected
936
+ * - Buffer persists after flush (for future retry/reconnection scenarios)
937
+ * - Built on WebSocketAudioClient for robust protocol handling
938
+ * - Simple API: connect() → sendAudio() → stopRecording()
939
+ * - Type-safe message handling with callbacks
940
+ * - Automatic backpressure management
941
+ * - Overflow detection with buffer state tracking
942
+ *
943
+ * Example:
944
+ * ```typescript
945
+ * const client = new RealTimeTwoWayWebSocketRecognitionClient({
946
+ * url: 'ws://localhost:3101/ws/v1/recognize',
947
+ * onTranscript: (result) => console.log(result.finalTranscript),
948
+ * onError: (error) => console.error(error),
949
+ * maxBufferDurationSec: 60 // Ring buffer for 60 seconds
950
+ * });
951
+ *
952
+ * await client.connect();
953
+ *
954
+ * // Send audio chunks - always stored in ring buffer, sent if connected
955
+ * micStream.on('data', (chunk) => client.sendAudio(chunk));
956
+ *
957
+ * // Signal end of audio and wait for final results
958
+ * await client.stopRecording();
959
+ *
960
+ * // Server will close connection after sending finals
961
+ * // No manual cleanup needed - browser handles it
962
+ * ```
963
+ */
964
+
965
/**
 * Check if a WebSocket close code indicates normal closure.
 * Useful in an onDisconnected callback to distinguish expected shutdown
 * (e.g. code 1000) from an error (e.g. code 1006).
 * @param code - WebSocket close code
 * @returns true if the disconnection was normal/expected, false if it was an error
 */
declare function isNormalDisconnection(code: number): boolean;
971
/**
 * Re-export TranscriptionResultV1 as TranscriptionResult for backward compatibility.
 * New code should prefer the versioned name TranscriptionResultV1.
 */
type TranscriptionResult = TranscriptionResultV1;
975
+
976
/**
 * RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition
 *
 * Implements the IRecognitionClient interface for dependency injection and testing.
 * Extends WebSocketAudioClient with local audio buffering (ring buffer) and a
 * simple callback-based API configured via RealTimeTwoWayWebSocketRecognitionClientConfig.
 */
declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
    /** Wire protocol version constant (value not visible in this declaration file). */
    private static readonly PROTOCOL_VERSION;
    /** Configuration supplied at construction time. */
    private config;
    /** Local ring buffer of audio chunks, sized via maxBufferDurationSec / chunksPerSecond. */
    private audioBuffer;
    /** Internal handler for incoming server messages — presumably parse/dispatch; see implementation. */
    private messageHandler;
    /** Current ClientState, exposed via getState(). */
    private state;
    /** Tracks an in-flight connect() attempt so callers can await it. */
    private connectionPromise;
    /** Gates debug-level output in the internal log helper. */
    private isDebugLogEnabled;
    /** Running total of audio bytes sent, reported via getStats(). */
    private audioBytesSent;
    /** Running total of audio chunks sent, reported via getStats(). */
    private audioChunksSent;
    /** Interval between periodic audio-stats log entries — presumably ms; confirm in implementation. */
    private audioStatsLogInterval;
    /** Time of the last audio-stats log entry — presumably a ms timestamp; confirm in implementation. */
    private lastAudioStatsLog;
    constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig);
    /**
     * Internal logging helper - only logs if a logger was provided in config.
     * Debug logs are additionally gated by the isDebugLogEnabled flag.
     * @param level - Log level: debug, info, warn, or error
     * @param message - Message to log
     * @param data - Optional additional data to log
     */
    private log;
    /**
     * Clean up internal resources to free memory.
     * Called when the connection closes (normally or abnormally).
     */
    private cleanup;
    /** Connect to the WebSocket endpoint; see IRecognitionClient.connect. */
    connect(): Promise<void>;
    /** Send audio; buffered locally and sent when the connection is ready. See IRecognitionClient.sendAudio. */
    sendAudio(audioData: ArrayBuffer | ArrayBufferView): void;
    /** Signal end of audio and await the final transcript; see IRecognitionClient.stopRecording. */
    stopRecording(): Promise<void>;
    /** @returns UUID v4 string identifying this recognition session. */
    getAudioUtteranceId(): string;
    /** @returns the current ClientState value. */
    getState(): ClientState;
    /** @returns true if the WebSocket connection is open. */
    isConnected(): boolean;
    /** @returns true while a connection attempt is in progress. */
    isConnecting(): boolean;
    /** @returns true while stopRecording() is in progress. */
    isStopping(): boolean;
    /** @returns true once the transcription is complete. */
    isTranscriptionFinished(): boolean;
    /** @returns true if the ring buffer has wrapped around (overwritten old data). */
    isBufferOverflowing(): boolean;
    /** @returns statistics about audio transmission and buffering. */
    getStats(): IRecognitionClientStats;
    /** Protected hook for the socket-open event (dispatch timing defined by the WebSocketAudioClient base). */
    protected onConnected(): void;
    /** Protected hook for socket close; receives the WebSocket close code and reason. */
    protected onDisconnected(code: number, reason: string): void;
    /** Protected hook for socket error events. */
    protected onError(error: Event): void;
    /** Protected hook for each incoming protocol message ({ v: version, type, data }). */
    protected onMessage(msg: {
        v: number;
        type: string;
        data: any;
    }): void;
    /**
     * Handle control messages from the server.
     * @param msg - Control message containing server actions
     */
    private handleControlMessage;
    /**
     * Send audio immediately to the server (without buffering).
     * @param audioData - Audio data to send
     */
    private sendAudioNow;
}
1038
+
1039
/*
 * Re-export map. The single-letter aliases appear to be bundler-generated
 * (minified chunk exports matched by the package entry points) — do not
 * rename or reorder them by hand; regenerate via the build instead.
 */
export { type ASRRequestConfig as A, ClientState as C, DeepgramModel as D, type ErrorResultV1 as E, type FunctionCallResultV1 as F, type GameContextV1 as G, type IRecognitionClient as I, Language as L, type MetadataResultV1 as M, type RecognitionCallbackUrl as R, SampleRate as S, type TranscriptionResultV1 as T, type RealTimeTwoWayWebSocketRecognitionClientConfig as a, type IRecognitionClientConfig as b, RealTimeTwoWayWebSocketRecognitionClient as c, type TranscriptionResult as d, type IRecognitionClientStats as e, AudioEncoding as f, RecognitionContextTypeV1 as g, ControlSignalTypeV1 as h, isNormalDisconnection as i, RecognitionResultTypeV1 as j, type ASRRequestV1 as k, RecognitionProvider as l, GoogleModel as m };