@volley/recognition-client-sdk-node22 0.1.424

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/README.md +344 -0
  2. package/dist/browser.bundled.d.ts +1280 -0
  3. package/dist/browser.d.ts +10 -0
  4. package/dist/browser.d.ts.map +1 -0
  5. package/dist/config-builder.d.ts +134 -0
  6. package/dist/config-builder.d.ts.map +1 -0
  7. package/dist/errors.d.ts +41 -0
  8. package/dist/errors.d.ts.map +1 -0
  9. package/dist/factory.d.ts +36 -0
  10. package/dist/factory.d.ts.map +1 -0
  11. package/dist/index.bundled.d.ts +2572 -0
  12. package/dist/index.d.ts +16 -0
  13. package/dist/index.d.ts.map +1 -0
  14. package/dist/index.js +10199 -0
  15. package/dist/index.js.map +7 -0
  16. package/dist/recog-client-sdk.browser.d.ts +10 -0
  17. package/dist/recog-client-sdk.browser.d.ts.map +1 -0
  18. package/dist/recog-client-sdk.browser.js +5746 -0
  19. package/dist/recog-client-sdk.browser.js.map +7 -0
  20. package/dist/recognition-client.d.ts +128 -0
  21. package/dist/recognition-client.d.ts.map +1 -0
  22. package/dist/recognition-client.types.d.ts +271 -0
  23. package/dist/recognition-client.types.d.ts.map +1 -0
  24. package/dist/simplified-vgf-recognition-client.d.ts +178 -0
  25. package/dist/simplified-vgf-recognition-client.d.ts.map +1 -0
  26. package/dist/utils/audio-ring-buffer.d.ts +69 -0
  27. package/dist/utils/audio-ring-buffer.d.ts.map +1 -0
  28. package/dist/utils/message-handler.d.ts +45 -0
  29. package/dist/utils/message-handler.d.ts.map +1 -0
  30. package/dist/utils/url-builder.d.ts +28 -0
  31. package/dist/utils/url-builder.d.ts.map +1 -0
  32. package/dist/vgf-recognition-mapper.d.ts +66 -0
  33. package/dist/vgf-recognition-mapper.d.ts.map +1 -0
  34. package/dist/vgf-recognition-state.d.ts +91 -0
  35. package/dist/vgf-recognition-state.d.ts.map +1 -0
  36. package/package.json +74 -0
  37. package/src/browser.ts +24 -0
  38. package/src/config-builder.spec.ts +265 -0
  39. package/src/config-builder.ts +240 -0
  40. package/src/errors.ts +84 -0
  41. package/src/factory.spec.ts +215 -0
  42. package/src/factory.ts +47 -0
  43. package/src/index.ts +127 -0
  44. package/src/recognition-client.spec.ts +889 -0
  45. package/src/recognition-client.ts +844 -0
  46. package/src/recognition-client.types.ts +338 -0
  47. package/src/simplified-vgf-recognition-client.integration.spec.ts +718 -0
  48. package/src/simplified-vgf-recognition-client.spec.ts +1525 -0
  49. package/src/simplified-vgf-recognition-client.ts +524 -0
  50. package/src/utils/audio-ring-buffer.spec.ts +335 -0
  51. package/src/utils/audio-ring-buffer.ts +170 -0
  52. package/src/utils/message-handler.spec.ts +311 -0
  53. package/src/utils/message-handler.ts +131 -0
  54. package/src/utils/url-builder.spec.ts +252 -0
  55. package/src/utils/url-builder.ts +92 -0
  56. package/src/vgf-recognition-mapper.spec.ts +78 -0
  57. package/src/vgf-recognition-mapper.ts +232 -0
  58. package/src/vgf-recognition-state.ts +102 -0
@@ -0,0 +1,1280 @@
1
+ import { z } from 'zod';
2
+
3
+ /**
4
+ * Provider types and enums for recognition services
5
+ * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
6
+ */
7
+ /**
8
+ * Supported speech recognition providers
9
+ */
10
+ declare enum RecognitionProvider {
11
+ ASSEMBLYAI = "assemblyai",
12
+ DEEPGRAM = "deepgram",
13
+ ELEVENLABS = "elevenlabs",
14
+ FIREWORKS = "fireworks",
15
+ GOOGLE = "google",
16
+ GEMINI_BATCH = "gemini-batch",
17
+ OPENAI_BATCH = "openai-batch",
18
+ OPENAI_REALTIME = "openai-realtime",
19
+ TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
20
+ TEST_ASR_STREAMING = "test-asr-streaming"
21
+ }
22
/**
 * ASR API type - distinguishes between streaming and file-based transcription APIs
 * - STREAMING: Real-time streaming APIs (Deepgram, AssemblyAI, Google)
 * - FILE_BASED: File upload/batch APIs (OpenAI Batch, Gemini Batch)
 */
declare enum ASRApiType {
    STREAMING = "streaming",
    FILE_BASED = "file-based"
}
31
/**
 * Deepgram model names
 * Values are the model identifiers passed to the Deepgram API.
 */
declare enum DeepgramModel {
    NOVA_2 = "nova-2",
    NOVA_3 = "nova-3",
    FLUX_GENERAL_EN = "flux-general-en"
}
39
/**
 * Google Cloud Speech models
 * @see https://cloud.google.com/speech-to-text/docs/transcription-model
 * @see https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model
 */
declare enum GoogleModel {
    CHIRP_3 = "chirp_3",
    CHIRP_2 = "chirp_2",
    CHIRP = "chirp",
    LATEST_LONG = "latest_long",
    LATEST_SHORT = "latest_short",
    TELEPHONY = "telephony",
    TELEPHONY_SHORT = "telephony_short",
    DEFAULT = "default",
    COMMAND_AND_SEARCH = "command_and_search",
    PHONE_CALL = "phone_call",
    VIDEO = "video"
}
57
/**
 * Fireworks AI models for ASR
 * @see https://docs.fireworks.ai/guides/querying-asr-models
 * @see https://fireworks.ai/models/fireworks/fireworks-asr-large
 */
declare enum FireworksModel {
    // NOTE(review): member name ASR_V1 maps to the "fireworks-asr-large" model id —
    // the naming is intentional per the @see links above; verify before renaming.
    ASR_V1 = "fireworks-asr-large",
    ASR_V2 = "fireworks-asr-v2",
    WHISPER_V3 = "whisper-v3",
    WHISPER_V3_TURBO = "whisper-v3-turbo"
}
68
/**
 * ElevenLabs Scribe models for speech-to-text
 * @see https://elevenlabs.io/blog/introducing-scribe-v2-realtime
 * @see https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming
 * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert
 */
declare enum ElevenLabsModel {
    SCRIBE_V2_REALTIME = "scribe_v2_realtime",
    SCRIBE_V1 = "scribe_v1"
}
78
/**
 * OpenAI Realtime API transcription models
 * These are the verified `input_audio_transcription.model` values.
 * @see https://platform.openai.com/docs/guides/realtime
 */
declare enum OpenAIRealtimeModel {
    GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
}
/**
 * Type alias for any model from any provider
 *
 * NOTE(review): the trailing `| string` widens this union to plain `string`,
 * so the enum members act as documentation/autocomplete hints only — any
 * string is accepted by the type checker.
 */
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | ElevenLabsModel | OpenAIRealtimeModel | string;
90
+
91
/**
 * Audio encoding types
 * Numeric enum; the merged namespace below (enum/namespace declaration
 * merging) attaches conversion and validation helpers to the same name.
 */
declare enum AudioEncoding {
    ENCODING_UNSPECIFIED = 0,
    LINEAR16 = 1,
    OGG_OPUS = 2,
    FLAC = 3,
    MULAW = 4,
    ALAW = 5
}
declare namespace AudioEncoding {
    /**
     * Convert numeric ID to AudioEncoding enum
     * @param id - Numeric encoding identifier (0-5)
     * @returns AudioEncoding enum value or undefined if invalid
     */
    function fromId(id: number): AudioEncoding | undefined;
    /**
     * Convert string name to AudioEncoding enum
     * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
     * @returns AudioEncoding enum value or undefined if invalid
     */
    function fromName(nameStr: string): AudioEncoding | undefined;
    /**
     * Convert AudioEncoding enum to numeric ID
     * @param encoding - AudioEncoding enum value
     * @returns Numeric ID (0-5)
     */
    function toId(encoding: AudioEncoding): number;
    /**
     * Convert AudioEncoding enum to string name
     * @param encoding - AudioEncoding enum value
     * @returns String name like "LINEAR16", "MULAW", etc.
     */
    function toName(encoding: AudioEncoding): string;
    /**
     * Check if a numeric ID is a valid encoding
     * @param id - Numeric identifier to validate
     * @returns true if valid encoding ID
     */
    function isIdValid(id: number): boolean;
    /**
     * Check if a string name is a valid encoding
     * @param nameStr - String name to validate
     * @returns true if valid encoding name
     */
    function isNameValid(nameStr: string): boolean;
}
140
/**
 * Common sample rates (in Hz)
 * Numeric enum whose values ARE the Hz rates; the merged namespace below
 * (enum/namespace declaration merging) attaches conversion/validation helpers.
 */
declare enum SampleRate {
    RATE_8000 = 8000,
    RATE_16000 = 16000,
    RATE_22050 = 22050,
    RATE_24000 = 24000,
    RATE_32000 = 32000,
    RATE_44100 = 44100,
    RATE_48000 = 48000
}
declare namespace SampleRate {
    /**
     * Convert Hz value to SampleRate enum
     * @param hz - Sample rate in Hz (8000, 16000, etc.)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromHz(hz: number): SampleRate | undefined;
    /**
     * Convert string name to SampleRate enum
     * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
     * @returns SampleRate enum value or undefined if invalid
     */
    function fromName(nameStr: string): SampleRate | undefined;
    /**
     * Convert SampleRate enum to Hz value
     * @param rate - SampleRate enum value
     * @returns Hz value (8000, 16000, etc.)
     */
    function toHz(rate: SampleRate): number;
    /**
     * Convert SampleRate enum to string name
     * @param rate - SampleRate enum value
     * @returns String name like "RATE_8000", "RATE_16000", etc.
     */
    function toName(rate: SampleRate): string;
    /**
     * Check if a numeric Hz value is a valid sample rate
     * @param hz - Hz value to validate
     * @returns true if valid sample rate
     */
    function isHzValid(hz: number): boolean;
    /**
     * Check if a string name is a valid sample rate
     * @param nameStr - String name to validate
     * @returns true if valid sample rate name
     */
    function isNameValid(nameStr: string): boolean;
}
190
/**
 * Supported languages for recognition
 * Using BCP-47 language tags (RFC 5646) as the enum values.
 */
declare enum Language {
    ENGLISH_US = "en-US",
    ENGLISH_GB = "en-GB",
    SPANISH_ES = "es-ES",
    SPANISH_MX = "es-MX",
    FRENCH_FR = "fr-FR",
    GERMAN_DE = "de-DE",
    ITALIAN_IT = "it-IT",
    PORTUGUESE_BR = "pt-BR",
    JAPANESE_JP = "ja-JP",
    KOREAN_KR = "ko-KR",
    CHINESE_CN = "zh-CN",
    CHINESE_TW = "zh-TW"
}
208
+
209
/**
 * Recognition Result Types V1
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 * Types and schemas for recognition results sent to SDK clients
 */

/**
 * Message type discriminator for recognition results V1
 * Used as the `type` field (discriminated-union tag) on the result schemas below.
 */
declare enum RecognitionResultTypeV1 {
    TRANSCRIPTION = "Transcription",
    FUNCTION_CALL = "FunctionCall",
    METADATA = "Metadata",
    ERROR = "Error",
    CLIENT_CONTROL_MESSAGE = "ClientControlMessage",
    AUDIO_METRICS = "AudioMetrics"
}
226
/**
 * Transcription result V1 - contains transcript message
 * In the long run game side should not need to know it. In the short run it is send back to client.
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 *
 * Compiler-emitted zod v3 declaration: the 4th and 5th `ZodObject` type
 * arguments are the parsed (output) and raw (input) object shapes; here they
 * are identical because the schema applies no transforms or defaults.
 */
declare const TranscriptionResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.TRANSCRIPTION>;
    audioUtteranceId: z.ZodString;
    finalTranscript: z.ZodString;
    finalTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
    pendingTranscript: z.ZodOptional<z.ZodString>;
    pendingTranscriptConfidence: z.ZodOptional<z.ZodNumber>;
    // NOTE(review): snake_case outlier among camelCase fields — presumably a
    // wire-format compatibility requirement; confirm before normalizing.
    is_finished: z.ZodBoolean;
    voiceStart: z.ZodOptional<z.ZodNumber>;
    voiceDuration: z.ZodOptional<z.ZodNumber>;
    voiceEnd: z.ZodOptional<z.ZodNumber>;
    startTimestamp: z.ZodOptional<z.ZodNumber>;
    endTimestamp: z.ZodOptional<z.ZodNumber>;
    receivedAtMs: z.ZodOptional<z.ZodNumber>;
    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.TRANSCRIPTION;
    audioUtteranceId: string;
    finalTranscript: string;
    is_finished: boolean;
    finalTranscriptConfidence?: number | undefined;
    pendingTranscript?: string | undefined;
    pendingTranscriptConfidence?: number | undefined;
    voiceStart?: number | undefined;
    voiceDuration?: number | undefined;
    voiceEnd?: number | undefined;
    startTimestamp?: number | undefined;
    endTimestamp?: number | undefined;
    receivedAtMs?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
}, {
    type: RecognitionResultTypeV1.TRANSCRIPTION;
    audioUtteranceId: string;
    finalTranscript: string;
    is_finished: boolean;
    finalTranscriptConfidence?: number | undefined;
    pendingTranscript?: string | undefined;
    pendingTranscriptConfidence?: number | undefined;
    voiceStart?: number | undefined;
    voiceDuration?: number | undefined;
    voiceEnd?: number | undefined;
    startTimestamp?: number | undefined;
    endTimestamp?: number | undefined;
    receivedAtMs?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
}>;
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
278
/**
 * Function call result V1 - similar to LLM function call
 * In the long run game server should know it, rather than TV or client.
 * `functionArgJson` carries the arguments as a JSON-encoded string.
 */
declare const FunctionCallResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.FUNCTION_CALL>;
    audioUtteranceId: z.ZodString;
    functionName: z.ZodString;
    functionArgJson: z.ZodString;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.FUNCTION_CALL;
    audioUtteranceId: string;
    functionName: string;
    functionArgJson: string;
}, {
    type: RecognitionResultTypeV1.FUNCTION_CALL;
    audioUtteranceId: string;
    functionName: string;
    functionArgJson: string;
}>;
type FunctionCallResultV1 = z.infer<typeof FunctionCallResultSchemaV1>;
299
/**
 * Transcript outcome type - categorizes final transcript state
 * Used in Metadata schema. Maps 1:1 with Datadog metrics:
 * - WITH_CONTENT → recog.client.websocket.transcript.final_with_content
 * - EMPTY → recog.client.websocket.transcript.final_empty
 * - NEVER_SENT → derived from sessions.streamed - final_with_content - final_empty
 * - ERROR_* → 1:1 mapping to ErrorTypeV1 for error-caused outcomes
 */
declare enum TranscriptOutcomeType {
    WITH_CONTENT = "with_content",
    EMPTY = "empty",
    NEVER_SENT = "never_sent",
    ERROR_AUTHENTICATION = "error_authentication",
    ERROR_VALIDATION = "error_validation",
    ERROR_PROVIDER = "error_provider",
    ERROR_TIMEOUT = "error_timeout",
    ERROR_QUOTA = "error_quota",
    ERROR_INTERNAL_QUOTA = "error_internal_quota",
    ERROR_CONNECTION = "error_connection",
    ERROR_NO_AUDIO = "error_no_audio",
    ERROR_CIRCUIT_BREAKER = "error_circuit_breaker",
    ERROR_UNKNOWN = "error_unknown"
}
322
/**
 * Metadata result V1 - contains metadata, timing information, and ASR config
 * Sent when the provider connection closes to provide final timing metrics and config
 * In the long run game server should know it, rather than TV or client.
 *
 * Compiler-emitted zod v3 declaration: the 4th/5th `ZodObject` type arguments
 * are the parsed (output) and raw (input) shapes. Note `costInUSD` is
 * Optional<Default<...>> — the default only applies when the key is present.
 */
declare const MetadataResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.METADATA>;
    audioUtteranceId: z.ZodString;
    recordingStartMs: z.ZodOptional<z.ZodNumber>;
    recordingEndMs: z.ZodOptional<z.ZodNumber>;
    transcriptEndMs: z.ZodOptional<z.ZodNumber>;
    socketCloseAtMs: z.ZodOptional<z.ZodNumber>;
    duration: z.ZodOptional<z.ZodNumber>;
    volume: z.ZodOptional<z.ZodNumber>;
    accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
    costInUSD: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
    apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
    // asrConfig / rawAsrMetadata are carried as strings — presumably
    // JSON-serialized provider payloads; confirm against the server encoder.
    asrConfig: z.ZodOptional<z.ZodString>;
    rawAsrMetadata: z.ZodOptional<z.ZodString>;
    transcriptOutcome: z.ZodOptional<z.ZodNativeEnum<typeof TranscriptOutcomeType>>;
    audioMetrics: z.ZodOptional<z.ZodObject<{
        valid: z.ZodBoolean;
        audioBeginMs: z.ZodNumber;
        audioEndMs: z.ZodNumber;
        maxVolume: z.ZodNumber;
        minVolume: z.ZodNumber;
        avgVolume: z.ZodNumber;
        silenceRatio: z.ZodNumber;
        clippingRatio: z.ZodNumber;
        snrEstimate: z.ZodNullable<z.ZodNumber>;
        lastNonSilenceMs: z.ZodNumber;
        timestamp: z.ZodString;
    }, "strip", z.ZodTypeAny, {
        valid: boolean;
        audioBeginMs: number;
        audioEndMs: number;
        maxVolume: number;
        minVolume: number;
        avgVolume: number;
        silenceRatio: number;
        clippingRatio: number;
        snrEstimate: number | null;
        lastNonSilenceMs: number;
        timestamp: string;
    }, {
        valid: boolean;
        audioBeginMs: number;
        audioEndMs: number;
        maxVolume: number;
        minVolume: number;
        avgVolume: number;
        silenceRatio: number;
        clippingRatio: number;
        snrEstimate: number | null;
        lastNonSilenceMs: number;
        timestamp: string;
    }>>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.METADATA;
    audioUtteranceId: string;
    recordingStartMs?: number | undefined;
    recordingEndMs?: number | undefined;
    transcriptEndMs?: number | undefined;
    socketCloseAtMs?: number | undefined;
    duration?: number | undefined;
    volume?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
    costInUSD?: number | undefined;
    apiType?: ASRApiType | undefined;
    asrConfig?: string | undefined;
    rawAsrMetadata?: string | undefined;
    transcriptOutcome?: TranscriptOutcomeType | undefined;
    audioMetrics?: {
        valid: boolean;
        audioBeginMs: number;
        audioEndMs: number;
        maxVolume: number;
        minVolume: number;
        avgVolume: number;
        silenceRatio: number;
        clippingRatio: number;
        snrEstimate: number | null;
        lastNonSilenceMs: number;
        timestamp: string;
    } | undefined;
}, {
    type: RecognitionResultTypeV1.METADATA;
    audioUtteranceId: string;
    recordingStartMs?: number | undefined;
    recordingEndMs?: number | undefined;
    transcriptEndMs?: number | undefined;
    socketCloseAtMs?: number | undefined;
    duration?: number | undefined;
    volume?: number | undefined;
    accumulatedAudioTimeMs?: number | undefined;
    costInUSD?: number | undefined;
    apiType?: ASRApiType | undefined;
    asrConfig?: string | undefined;
    rawAsrMetadata?: string | undefined;
    transcriptOutcome?: TranscriptOutcomeType | undefined;
    audioMetrics?: {
        valid: boolean;
        audioBeginMs: number;
        audioEndMs: number;
        maxVolume: number;
        minVolume: number;
        avgVolume: number;
        silenceRatio: number;
        clippingRatio: number;
        snrEstimate: number | null;
        lastNonSilenceMs: number;
        timestamp: string;
    } | undefined;
}>;
type MetadataResultV1 = z.infer<typeof MetadataResultSchemaV1>;
437
/**
 * Error type enum V1 - categorizes different types of errors
 * Maps 1:1 onto the ERROR_* members of TranscriptOutcomeType.
 */
declare enum ErrorTypeV1 {
    AUTHENTICATION_ERROR = "authentication_error",
    VALIDATION_ERROR = "validation_error",
    PROVIDER_ERROR = "provider_error",
    TIMEOUT_ERROR = "timeout_error",
    QUOTA_EXCEEDED = "quota_exceeded",
    INTERNAL_QUOTA_EXHAUSTED = "internal_quota_exhausted",
    CONNECTION_ERROR = "connection_error",
    NO_AUDIO_ERROR = "no_audio_error",
    CIRCUIT_BREAKER_OPEN = "circuit_breaker_open",
    UNKNOWN_ERROR = "unknown_error"
}
452
/**
 * Error result V1 - contains error message
 * In the long run game server should know it, rather than TV or client.
 * All fields except the discriminator and utterance id are optional;
 * `code` accepts either a string or a numeric error code.
 */
declare const ErrorResultSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionResultTypeV1.ERROR>;
    audioUtteranceId: z.ZodString;
    errorType: z.ZodOptional<z.ZodNativeEnum<typeof ErrorTypeV1>>;
    message: z.ZodOptional<z.ZodString>;
    code: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodNumber]>>;
    description: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionResultTypeV1.ERROR;
    audioUtteranceId: string;
    errorType?: ErrorTypeV1 | undefined;
    message?: string | undefined;
    code?: string | number | undefined;
    description?: string | undefined;
}, {
    type: RecognitionResultTypeV1.ERROR;
    audioUtteranceId: string;
    errorType?: ErrorTypeV1 | undefined;
    message?: string | undefined;
    code?: string | number | undefined;
    description?: string | undefined;
}>;
type ErrorResultV1 = z.infer<typeof ErrorResultSchemaV1>;
479
+
480
/**
 * Recognition Context Types V1
 * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
 * Types and schemas for recognition context data
 */

/**
 * Message type discriminator for recognition context V1
 * Used as the `type` field on client -> server context messages.
 */
declare enum RecognitionContextTypeV1 {
    GAME_CONTEXT = "GameContext",
    CONTROL_SIGNAL = "ControlSignal",
    ASR_REQUEST = "ASRRequest"
}
494
/**
 * Control signal types for recognition V1
 */
declare enum ControlSignalTypeV1 {
    START_RECORDING = "start_recording",
    STOP_RECORDING = "stop_recording"
}
501
/**
 * Game context V1 - contains game state information
 * `slotMap` maps slot names to candidate phrase lists — presumably used for
 * vocabulary biasing (e.g. song titles); confirm against server-side mapper.
 */
declare const GameContextSchemaV1: z.ZodObject<{
    type: z.ZodLiteral<RecognitionContextTypeV1.GAME_CONTEXT>;
    gameId: z.ZodString;
    gamePhase: z.ZodString;
    promptSTT: z.ZodOptional<z.ZodString>;
    promptSTF: z.ZodOptional<z.ZodString>;
    promptTTF: z.ZodOptional<z.ZodString>;
    slotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
}, "strip", z.ZodTypeAny, {
    type: RecognitionContextTypeV1.GAME_CONTEXT;
    gameId: string;
    gamePhase: string;
    promptSTT?: string | undefined;
    promptSTF?: string | undefined;
    promptTTF?: string | undefined;
    slotMap?: Record<string, string[]> | undefined;
}, {
    type: RecognitionContextTypeV1.GAME_CONTEXT;
    gameId: string;
    gamePhase: string;
    promptSTT?: string | undefined;
    promptSTF?: string | undefined;
    promptTTF?: string | undefined;
    slotMap?: Record<string, string[]> | undefined;
}>;
type GameContextV1 = z.infer<typeof GameContextSchemaV1>;
530
+
531
/**
 * Unified ASR Request Configuration
 *
 * Provider-agnostic configuration for ASR (Automatic Speech Recognition) requests.
 * This interface provides a consistent API for clients regardless of the underlying provider.
 *
 * All fields use library-defined enums for type safety and consistency.
 * Provider-specific mappers will convert these to provider-native formats.
 */

/**
 * Final transcript stability modes
 *
 * Controls timeout duration for fallback final transcript after stopRecording().
 * Similar to AssemblyAI's turn detection confidence modes but applied to our
 * internal timeout mechanism when vendors don't respond with is_final=true.
 *
 * @see https://www.assemblyai.com/docs/speech-to-text/universal-streaming/turn-detection
 */
declare enum FinalTranscriptStability {
    /**
     * Aggressive mode: 100ms timeout
     * Fast response, optimized for short utterances and quick back-and-forth
     * Use cases: IVR, quick commands, retail confirmations
     */
    AGGRESSIVE = "aggressive",
    /**
     * Balanced mode: 200ms timeout (default)
     * Natural middle ground for most conversational scenarios
     * Use cases: General customer support, tech support, typical voice interactions
     */
    BALANCED = "balanced",
    /**
     * Conservative mode: 400ms timeout
     * Wait longer for providers, optimized for complex/reflective speech
     * Use cases: Healthcare, complex queries, careful thought processes
     */
    CONSERVATIVE = "conservative",
    /**
     * Experimental mode: 10000ms (10 seconds) timeout
     * Very long wait for batch/async providers that need significant processing time
     * Use cases: Batch processing (Gemini, OpenAI Whisper), complex audio analysis
     * Note: Should be cancelled immediately when transcript is received
     */
    EXPERIMENTAL = "experimental"
}
577
/**
 * Unified ASR request configuration
 *
 * This configuration is used by:
 * - Client SDKs to specify recognition parameters
 * - Demo applications for user input
 * - Service layer to configure provider sessions
 *
 * Core fields only - all provider-specific options go in providerOptions
 *
 * @example
 * ```typescript
 * const config: ASRRequestConfig = {
 *   provider: RecognitionProvider.GOOGLE,
 *   model: GoogleModel.LATEST_LONG,
 *   language: Language.ENGLISH_US,
 *   sampleRate: SampleRate.RATE_16000, // or just 16000
 *   encoding: AudioEncoding.LINEAR16,
 *   providerOptions: {
 *     google: {
 *       enableAutomaticPunctuation: true,
 *       interimResults: true,
 *       singleUtterance: false
 *     }
 *   }
 * };
 * ```
 */
interface ASRRequestConfig {
    /**
     * The ASR provider to use
     * Must be one of the supported providers in RecognitionProvider enum
     */
    provider: RecognitionProvider | string;
    /**
     * Optional model specification for the provider
     * Can be provider-specific model enum or string
     * If not specified, provider's default model will be used
     */
    model?: RecognitionModel;
    /**
     * Language/locale for recognition
     * Use Language enum for common languages
     * Can also accept BCP-47 language tags as strings
     */
    language: Language | string;
    /**
     * Audio sample rate in Hz
     * Prefer using SampleRate enum values for standard rates
     * Can also accept numeric Hz values (e.g., 16000)
     */
    sampleRate: SampleRate | number;
    /**
     * Audio encoding format
     * Must match the actual audio data being sent
     * Use AudioEncoding enum for standard formats
     */
    encoding: AudioEncoding | string;
    /**
     * Enable interim (partial) results during recognition
     * When true, receive real-time updates before finalization
     * When false, only receive final results
     * Default: false
     */
    interimResults?: boolean;
    /**
     * Require GameContext before starting recognition such as song titles
     * When true, server waits for GameContext message before processing audio
     * When false, recognition starts immediately
     * Default: false
     */
    useContext?: boolean;
    /**
     * Final transcript stability mode
     *
     * Controls timeout duration for fallback final transcript when provider
     * doesn't respond with is_final=true after stopRecording().
     *
     * - aggressive: 100ms - fast response, may cut off slow providers
     * - balanced: 200ms - current default, good for most cases
     * - conservative: 400ms - wait longer for complex utterances
     *
     * @default 'balanced'
     * @see FinalTranscriptStability enum for detailed descriptions
     */
    finalTranscriptStability?: FinalTranscriptStability | string;
    /**
     * Traffic control priority for quota slot allocation
     *
     * Controls which quota slots this request can use when traffic control is enabled.
     * The quota system reserves a portion of slots for high-priority requests.
     *
     * - 'high': Can use all quota slots (reserved for critical games like song-quiz)
     * - 'low': Limited to non-reserved slots (default for most requests)
     *
     * @default 'low'
     */
    priority?: 'low' | 'high';
    /**
     * Additional provider-specific options
     *
     * Common options per provider:
     * - Deepgram: punctuate, smart_format, diarize, utterances
     * - Google: enableAutomaticPunctuation, singleUtterance, enableWordTimeOffsets
     * - AssemblyAI: formatTurns, filter_profanity, word_boost
     *
     * Note: interimResults is now a top-level field, but can still be overridden per provider
     *
     * @example
     * ```typescript
     * providerOptions: {
     *   google: {
     *     enableAutomaticPunctuation: true,
     *     singleUtterance: false,
     *     enableWordTimeOffsets: false
     *   }
     * }
     * ```
     */
    providerOptions?: Record<string, any>;
    /**
     * Optional fallback ASR configurations
     *
     * List of alternative ASR configurations to use if the primary fails.
     * Each fallback config is a complete ASRRequestConfig that will be tried
     * in order until one succeeds.
     *
     * @example
     * ```typescript
     * fallbackModels: [
     *   {
     *     provider: RecognitionProvider.DEEPGRAM,
     *     model: DeepgramModel.NOVA_2,
     *     language: Language.ENGLISH_US,
     *     sampleRate: 16000,
     *     encoding: AudioEncoding.LINEAR16
     *   },
     *   {
     *     provider: RecognitionProvider.GOOGLE,
     *     model: GoogleModel.LATEST_SHORT,
     *     language: Language.ENGLISH_US,
     *     sampleRate: 16000,
     *     encoding: AudioEncoding.LINEAR16
     *   }
     * ]
     * ```
     */
    fallbackModels?: ASRRequestConfig[];
}
726
+
727
/**
 * Standard stage/environment constants used across all services
 * Declared with readonly literal members so `Stage` below resolves to the
 * union "local" | "dev" | "staging" | "production".
 */
declare const STAGES: {
    readonly LOCAL: "local";
    readonly DEV: "dev";
    readonly STAGING: "staging";
    readonly PRODUCTION: "production";
};
type Stage = typeof STAGES[keyof typeof STAGES];
737
+
738
+ /**
739
+ * Generic WebSocket protocol types and utilities
740
+ * Supports flexible versioning and message types
741
+ * Used by both client and server implementations
742
+ */
743
+
744
+ /**
745
+ * Base message structure - completely flexible
746
+ * @template V - Version type (number, string, etc.)
747
+ */
748
+ interface Message<V = number> {
749
+ v: V;
750
+ type: string;
751
+ data?: unknown;
752
+ }
753
/**
 * Version serializer interface
 * Converts between version type V and byte representation
 * (serialize maps V to a single number; deserialize is the inverse).
 */
interface VersionSerializer<V> {
    serialize: (v: V) => number;
    deserialize: (byte: number) => V;
}
761
+
762
/**
 * WebSocketAudioClient - Abstract base class for WebSocket clients
 * Sends audio and control messages, receives responses from server
 *
 * Features:
 * - Generic version type support (number, string, etc.)
 * - Type-safe upward/downward message data
 * - Client-side backpressure monitoring
 * - Abstract hooks for application-specific logic
 * - Format-agnostic audio protocol (supports any encoding)
 */

/**
 * Connection configuration for WebSocketAudioClient.
 * highWM/lowWM are the high/low water marks used for the client-side
 * backpressure monitoring mentioned above — presumably buffered-byte
 * thresholds; confirm units against the implementation.
 */
type ClientConfig = {
    url: string;
    highWM?: number;
    lowWM?: number;
};
779
/**
 * WebSocketAudioClient - Abstract base class for WebSocket clients
 * that send audio frames and JSON messages
 *
 * @template V - Version type (number, string, object, etc.)
 * @template TUpward - Type of upward message data (Client -> Server)
 * @template TDownward - Type of downward message data (Server -> Client)
 *
 * @example
 * ```typescript
 * class MyClient extends WebSocketAudioClient<number, MyUpMsg, MyDownMsg> {
 *   protected onConnected() {
 *     console.log('Connected!');
 *   }
 *
 *   protected onMessage(msg) {
 *     console.log('Received:', msg.type, msg.data);
 *   }
 *
 *   protected onDisconnected(code, reason) {
 *     console.log('Disconnected:', code, reason);
 *   }
 *
 *   protected onError(error) {
 *     console.error('Error:', error);
 *   }
 * }
 *
 * const client = new MyClient({ url: 'ws://localhost:8080' });
 * client.connect();
 * client.sendMessage(1, 'configure', { language: 'en' });
 * client.sendAudio(audioData);
 * ```
 */
declare abstract class WebSocketAudioClient<V = number, // Version type (default: number)
TUpward = unknown, // Upward message data type
TDownward = unknown> {
    /** Connection configuration (url, water marks) captured at construction */
    private cfg;
    /** Converts version values to/from their one-byte wire form */
    protected versionSerializer: VersionSerializer<V>;
    /** Underlying WebSocket instance — implementation not visible here */
    private ws;
    /** NOTE(review): presumably an outgoing frame/message sequence counter — implementation not visible here */
    private seq;
    /** Effective high water mark resolved from cfg.highWM */
    private HWM;
    /** Effective low water mark resolved from cfg.lowWM */
    private LWM;
    constructor(cfg: ClientConfig, versionSerializer?: VersionSerializer<V>);
    /**
     * Hook: Called when WebSocket connection is established
     */
    protected abstract onConnected(): void;
    /**
     * Hook: Called when WebSocket connection closes
     * @param code - Close code (see WebSocketCloseCode enum)
     * @param reason - Human-readable close reason
     */
    protected abstract onDisconnected(code: number, reason: string): void;
    /**
     * Hook: Called when WebSocket error occurs
     */
    protected abstract onError(error: Event): void;
    /**
     * Hook: Called when downward message arrives from server
     * Override this to handle messages (optional - default does nothing)
     */
    protected onMessage(_msg: Message<V> & {
        data: TDownward;
    }): void;
    /** Open the WebSocket connection to the configured URL */
    connect(): void;
    /**
     * Send JSON message to server
     * @param version - Message version
     * @param type - Message type (developer defined)
     * @param data - Message payload (typed)
     */
    sendMessage(version: V, type: string, data: TUpward): void;
    /**
     * Send audio frame with specified encoding and sample rate
     * @param audioData - Audio data (any format: Int16Array, Uint8Array, ArrayBuffer, etc.)
     * @param version - Audio frame version
     * @param encodingId - Audio encoding ID (0-5, e.g., AudioEncoding.LINEAR16)
     * @param sampleRate - Sample rate in Hz (e.g., 16000)
     */
    sendAudio(audioData: ArrayBuffer | ArrayBufferView, version: V, encodingId: number, sampleRate: number): void;
    /**
     * Get current WebSocket buffer size (bytes queued but not yet transmitted)
     */
    getBufferedAmount(): number;
    /**
     * Check if local buffer is backpressured (buffered amount relative to the water marks)
     */
    isLocalBackpressured(): boolean;
    /**
     * Check if ready to send audio
     * Verifies: connection open, no local buffer pressure
     */
    canSend(): boolean;
    /**
     * Check if connection is open
     */
    isOpen(): boolean;
    /**
     * Get current connection state (WebSocket readyState value)
     */
    getReadyState(): number;
    /**
     * Close the WebSocket connection
     * Protected method for subclasses to implement disconnect logic
     * @param code - WebSocket close code (default: 1000 = normal closure)
     * @param reason - Human-readable close reason
     */
    protected closeConnection(code?: number, reason?: string): void;
}
889
+
890
+ /**
891
+ * Recognition Client Types
892
+ *
893
+ * Type definitions and interfaces for the recognition client SDK.
894
+ * These interfaces enable dependency injection, testing, and alternative implementations.
895
+ */
896
+
897
/**
 * Client connection state enum
 * Represents the various states a recognition client can be in during its lifecycle.
 * Typical flow: INITIAL -> CONNECTING -> CONNECTED -> READY -> STOPPING -> STOPPED,
 * with FAILED reachable on connection loss or failure.
 */
declare enum ClientState {
    /** Initial state, no connection established */
    INITIAL = "initial",
    /** Actively establishing WebSocket connection */
    CONNECTING = "connecting",
    /** WebSocket connected but waiting for server ready signal */
    CONNECTED = "connected",
    /** Server ready, can send audio */
    READY = "ready",
    /** Sent stop signal, waiting for final transcript */
    STOPPING = "stopping",
    /** Connection closed normally after stop */
    STOPPED = "stopped",
    /** Connection failed or lost unexpectedly */
    FAILED = "failed"
}
917
/**
 * Callback URL configuration with message type filtering.
 * Used to have the server push recognition results to external endpoints.
 */
interface RecognitionCallbackUrl {
    /** The callback URL endpoint */
    url: string;
    /** Array of message types to send to this URL. If empty/undefined, all types are sent */
    messageTypes?: Array<string | number>;
}
926
/**
 * Configuration for a recognition client.
 * Either `url` or `stage` must be provided; `url` wins if both are set.
 * All callbacks are optional.
 */
interface IRecognitionClientConfig {
    /**
     * WebSocket endpoint URL (optional)
     * Either `url` or `stage` must be provided.
     * If both are provided, `url` takes precedence.
     *
     * Example with explicit URL:
     * ```typescript
     * { url: 'wss://custom-endpoint.example.com/ws/v1/recognize' }
     * ```
     */
    url?: string;
    /**
     * Stage for recognition service (recommended)
     * Either `url` or `stage` must be provided.
     * If both are provided, `url` takes precedence.
     * Defaults to production if neither is provided.
     *
     * Example with STAGES enum (recommended):
     * ```typescript
     * import { STAGES } from '@recog/shared-types';
     * { stage: STAGES.STAGING }
     * ```
     *
     * String values also accepted:
     * ```typescript
     * { stage: 'staging' } // STAGES.LOCAL | STAGES.DEV | STAGES.STAGING | STAGES.PRODUCTION
     * ```
     */
    stage?: Stage | string;
    /** ASR configuration (provider, model, language, etc.) - optional */
    asrRequestConfig?: ASRRequestConfig;
    /** Game context for improved recognition accuracy */
    gameContext?: GameContextV1;
    /**
     * Game ID for tracking and routing purposes (optional)
     * If provided, this is added to the WebSocket URL as a query parameter.
     * If gameContext is also provided, this takes precedence over gameContext.gameId.
     */
    gameId?: string;
    /** Audio utterance ID (optional) - if not provided, a UUID v4 will be generated */
    audioUtteranceId?: string;
    /**
     * Callback URLs for server-side notifications with optional message type filtering (optional).
     * Only needed on the game side if another service must be notified about the transcription results.
     */
    callbackUrls?: RecognitionCallbackUrl[];
    /** User identification (optional) */
    userId?: string;
    /** Game session identification (optional). Called 'sessionId' in Platform and most games. */
    gameSessionId?: string;
    /** Device identification (optional) */
    deviceId?: string;
    /** Account identification (optional) */
    accountId?: string;
    /** Question answer identifier for tracking Q&A sessions (optional and tracking purpose only) */
    questionAnswerId?: string;
    /** Platform for audio recording device (optional, e.g., 'ios', 'android', 'web', 'unity') */
    platform?: string;
    /** Callback when transcript is received */
    onTranscript?: (result: TranscriptionResultV1) => void;
    /**
     * Callback when function call is received
     * Note: Not supported in 2025. P2 feature for future speech-to-function-call capability.
     */
    onFunctionCall?: (result: FunctionCallResultV1) => void;
    /** Callback when metadata is received. Fired only once, after transcription is complete. */
    onMetadata?: (metadata: MetadataResultV1) => void;
    /** Callback when error occurs */
    onError?: (error: ErrorResultV1) => void;
    /** Callback when connected to WebSocket */
    onConnected?: () => void;
    /**
     * Callback when WebSocket disconnects
     * @param code - WebSocket close code (1000 = normal, 1006 = abnormal, etc.)
     * @param reason - Close reason string
     */
    onDisconnected?: (code: number, reason: string) => void;
    /** High water mark for backpressure control (bytes) */
    highWaterMark?: number;
    /** Low water mark for backpressure control (bytes) */
    lowWaterMark?: number;
    /** Maximum buffer duration in seconds (default: 60s) */
    maxBufferDurationSec?: number;
    /** Expected chunks per second for ring buffer sizing (default: 100) */
    chunksPerSecond?: number;
    /**
     * Connection retry configuration (optional)
     * Only applies to initial connection establishment, not mid-stream interruptions.
     *
     * Default: { maxAttempts: 4, delayMs: 200 } (try once, retry 3 times = 4 total attempts)
     *
     * Timing: Attempt 1 → FAIL → wait 200ms → Attempt 2 → FAIL → wait 200ms → Attempt 3 → FAIL → wait 200ms → Attempt 4
     *
     * Example:
     * ```typescript
     * {
     *   connectionRetry: {
     *     maxAttempts: 2, // Try connecting up to 2 times (1 retry)
     *     delayMs: 500    // Wait 500ms between attempts
     *   }
     * }
     * ```
     */
    connectionRetry?: {
        /** Maximum number of connection attempts (default: 4, min: 1, max: 5) */
        maxAttempts?: number;
        /** Delay in milliseconds between retry attempts (default: 200ms) */
        delayMs?: number;
    };
    /**
     * Optional logger function for debugging
     * If not provided, no logging will occur
     * @param level - Log level: 'debug', 'info', 'warn', 'error'
     * @param message - Log message
     * @param data - Optional additional data
     */
    logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void;
}
1044
/**
 * Recognition Client Interface
 *
 * Main interface for real-time speech recognition clients.
 * Provides methods for connection management, audio streaming, and session control.
 */
interface IRecognitionClient {
    /**
     * Connect to the WebSocket endpoint
     * @returns Promise that resolves when connected
     * @throws Error if connection fails or times out
     */
    connect(): Promise<void>;
    /**
     * Send audio data to the recognition service
     * Audio is buffered locally and sent when connection is ready.
     * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
     */
    sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
    /**
     * Stop recording and wait for final transcript
     * The server will close the connection after sending the final transcript.
     * @returns Promise that resolves when final transcript is received
     */
    stopRecording(): Promise<void>;
    /**
     * Force stop and immediately close connection without waiting for server
     *
     * WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
     * - Does NOT wait for server to process remaining audio
     * - Does NOT receive final transcript from server
     * - Immediately closes WebSocket connection
     * - Cleans up resources (buffers, listeners)
     *
     * Use Cases:
     * - User explicitly cancels/abandons session
     * - Timeout scenarios where waiting is not acceptable
     * - Need immediate cleanup and can't wait for server
     *
     * RECOMMENDED: Use stopRecording() for normal shutdown.
     * Only use this when immediate disconnection is required.
     */
    stopAbnormally(): void;
    /**
     * Get the audio utterance ID for this session
     * Available immediately after client construction.
     * @returns UUID v4 string identifying this recognition session
     */
    getAudioUtteranceId(): string;
    /**
     * Get the current state of the client
     * @returns Current ClientState value
     */
    getState(): ClientState;
    /**
     * Check if WebSocket connection is open
     * @returns true if connected and ready to communicate
     */
    isConnected(): boolean;
    /**
     * Check if client is currently connecting
     * @returns true if connection is in progress
     */
    isConnecting(): boolean;
    /**
     * Check if client is currently stopping
     * @returns true if stopRecording() is in progress
     */
    isStopping(): boolean;
    /**
     * Check if transcription has finished
     * @returns true if the transcription is complete
     */
    isTranscriptionFinished(): boolean;
    /**
     * Check if the audio buffer has overflowed
     * @returns true if the ring buffer has wrapped around
     */
    isBufferOverflowing(): boolean;
    /**
     * Get client statistics
     * @returns Statistics about audio transmission and buffering
     */
    getStats(): IRecognitionClientStats;
    /**
     * Get the WebSocket URL being used by this client
     * Available immediately after client construction.
     * @returns WebSocket URL string
     */
    getUrl(): string;
}
1135
/**
 * Client statistics interface
 * Counters describing audio transmission and local ring-buffer state.
 */
interface IRecognitionClientStats {
    /** Total audio bytes sent to server */
    audioBytesSent: number;
    /** Total number of audio chunks sent */
    audioChunksSent: number;
    /** Total number of audio chunks buffered */
    audioChunksBuffered: number;
    /** Number of times the ring buffer overflowed */
    bufferOverflowCount: number;
    /** Current number of chunks in buffer */
    currentBufferedChunks: number;
    /** Whether the ring buffer has wrapped (overwritten old data) */
    hasWrapped: boolean;
}
1152
/**
 * Configuration for RealTimeTwoWayWebSocketRecognitionClient.
 * This extends IRecognitionClientConfig and is the main configuration interface
 * for creating a new RealTimeTwoWayWebSocketRecognitionClient instance.
 * Currently adds no members of its own; it exists as a named extension point.
 */
interface RealTimeTwoWayWebSocketRecognitionClientConfig extends IRecognitionClientConfig {
}
1159
+
1160
+ /**
1161
+ * RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition
1162
+ *
1163
+ * Features:
1164
+ * - Ring buffer-based audio storage with fixed memory footprint
1165
+ * - Automatic buffering when disconnected, immediate send when connected
1166
+ * - Buffer persists after flush (for future retry/reconnection scenarios)
1167
+ * - Built on WebSocketAudioClient for robust protocol handling
1168
+ * - Simple API: connect() → sendAudio() → stopRecording()
1169
+ * - Type-safe message handling with callbacks
1170
+ * - Automatic backpressure management
1171
+ * - Overflow detection with buffer state tracking
1172
+ *
1173
+ * Example:
1174
+ * ```typescript
1175
+ * const client = new RealTimeTwoWayWebSocketRecognitionClient({
1176
+ * url: 'ws://localhost:3101/ws/v1/recognize',
1177
+ * onTranscript: (result) => console.log(result.finalTranscript),
1178
+ * onError: (error) => console.error(error),
1179
+ * maxBufferDurationSec: 60 // Ring buffer for 60 seconds
1180
+ * });
1181
+ *
1182
+ * await client.connect();
1183
+ *
1184
+ * // Send audio chunks - always stored in ring buffer, sent if connected
1185
+ * micStream.on('data', (chunk) => client.sendAudio(chunk));
1186
+ *
1187
+ * // Signal end of audio and wait for final results
1188
+ * await client.stopRecording();
1189
+ *
1190
+ * // Server will close connection after sending finals
1191
+ * // No manual cleanup needed - browser handles it
1192
+ * ```
1193
+ */
1194
+
1195
/**
 * Re-export of TranscriptionResultV1 as TranscriptionResult for backward compatibility
 * with consumers written against the unversioned name.
 */
type TranscriptionResult = TranscriptionResultV1;
1199
+
1200
/**
 * RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition
 *
 * Implements IRecognitionClient interface for dependency injection and testing.
 * Extends WebSocketAudioClient with local audio buffering and simple callback-based API.
 */
declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioClient<number, any, any> implements IRecognitionClient {
    /** Protocol version used for all outgoing messages and audio frames */
    private static readonly PROTOCOL_VERSION;
    /** Full client configuration captured at construction */
    private config;
    /** Ring buffer for audio chunks (see maxBufferDurationSec / chunksPerSecond in config) */
    private audioBuffer;
    /** Dispatches incoming server messages to the configured callbacks — implementation not visible here */
    private messageHandler;
    /** Current lifecycle state (ClientState) */
    private state;
    /** In-flight promise returned by connect(), if any */
    private connectionPromise;
    /** Gates debug-level output of the `log` helper */
    private isDebugLogEnabled;
    /** Running total of audio bytes sent (reported via getStats) */
    private audioBytesSent;
    /** Running total of audio chunks sent (reported via getStats) */
    private audioChunksSent;
    /** NOTE(review): presumably the interval between periodic audio-stats log lines — implementation not visible here */
    private audioStatsLogInterval;
    /** Timestamp of the most recent audio-stats log emission */
    private lastAudioStatsLog;
    constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig);
    /**
     * Internal logging helper - only logs if a logger was provided in config
     * Debug logs are additionally gated by isDebugLogEnabled flag
     * @param level - Log level: debug, info, warn, or error
     * @param message - Message to log
     * @param data - Optional additional data to log
     */
    private log;
    /**
     * Clean up internal resources to free memory
     * Called when connection closes (normally or abnormally)
     */
    private cleanup;
    connect(): Promise<void>;
    /**
     * Attempt to connect with retry logic
     * Only retries on initial connection establishment, not mid-stream interruptions
     */
    private connectWithRetry;
    sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
    private sendAudioInternal;
    /**
     * Only active when the client is in READY state; otherwise it returns immediately.
     * @returns Promise that resolves when the recording is stopped
     */
    stopRecording(): Promise<void>;
    stopAbnormally(): void;
    getAudioUtteranceId(): string;
    getUrl(): string;
    getState(): ClientState;
    isConnected(): boolean;
    isConnecting(): boolean;
    isStopping(): boolean;
    isTranscriptionFinished(): boolean;
    isBufferOverflowing(): boolean;
    getStats(): IRecognitionClientStats;
    protected onConnected(): void;
    protected onDisconnected(code: number, reason: string): void;
    /**
     * Get human-readable description for WebSocket close code
     */
    private getCloseCodeDescription;
    protected onError(error: Event): void;
    protected onMessage(msg: {
        v: number;
        type: string;
        data: any;
    }): void;
    /**
     * Handle control messages from server
     * @param msg - Control message containing server actions
     */
    private handleControlMessage;
    /**
     * Send audio immediately to the server (without buffering)
     * @param audioData - Audio data to send
     */
    private sendAudioNow;
}
1278
+
1279
+ export { AudioEncoding, ControlSignalTypeV1 as ControlSignal, RealTimeTwoWayWebSocketRecognitionClient, RecognitionContextTypeV1 };
1280
+ export type { GameContextV1, RealTimeTwoWayWebSocketRecognitionClientConfig, TranscriptionResult };