@volley/recognition-client-sdk 0.1.200

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,595 @@
1
+ /**
2
+ * RealTimeTwoWayWebSocketRecognitionClient - Clean, compact SDK for real-time speech recognition
3
+ *
4
+ * Features:
5
+ * - Ring buffer-based audio storage with fixed memory footprint
6
+ * - Automatic buffering when disconnected, immediate send when connected
7
+ * - Buffer persists after flush (for future retry/reconnection scenarios)
8
+ * - Built on WebSocketAudioClient for robust protocol handling
9
+ * - Simple API: connect() → sendAudio() → stopRecording()
10
+ * - Type-safe message handling with callbacks
11
+ * - Automatic backpressure management
12
+ * - Overflow detection with buffer state tracking
13
+ *
14
+ * Example:
15
+ * ```typescript
16
+ * const client = new RealTimeTwoWayWebSocketRecognitionClient({
17
+ * url: 'ws://localhost:3101/ws/v1/recognize',
18
+ * onTranscript: (result) => console.log(result.finalTranscript),
19
+ * onError: (error) => console.error(error),
20
+ * maxBufferDurationSec: 60 // Ring buffer for 60 seconds
21
+ * });
22
+ *
23
+ * await client.connect();
24
+ *
25
+ * // Send audio chunks - always stored in ring buffer, sent if connected
26
+ * micStream.on('data', (chunk) => client.sendAudio(chunk));
27
+ *
28
+ * // Signal end of audio and wait for final results
29
+ * await client.stopRecording();
30
+ *
31
+ * // Server will close connection after sending finals
32
+ * // No manual cleanup needed - browser handles it
33
+ * ```
34
+ */
35
+
36
+ import { WebSocketAudioClient } from '@recog/websocket';
37
+ import {
38
+ AudioEncoding,
39
+ RecognitionResultTypeV1,
40
+ ClientControlActionV1,
41
+ RecognitionContextTypeV1,
42
+ ControlSignalTypeV1,
43
+ type TranscriptionResultV1,
44
+ type FunctionCallResultV1,
45
+ type MetadataResultV1,
46
+ type ErrorResultV1,
47
+ type ClientControlMessageV1,
48
+ type ASRRequestConfig,
49
+ type ASRRequestV1,
50
+ type GameContextV1,
51
+ SampleRate
52
+ } from '@recog/shared-types';
53
+ import { v4 as uuidv4 } from 'uuid';
54
+ import { ClientState } from './recognition-client.types.js';
55
+ import type {
56
+ IRecognitionClient,
57
+ IRecognitionClientStats,
58
+ RealTimeTwoWayWebSocketRecognitionClientConfig,
59
+ RecognitionCallbackUrl
60
+ } from './recognition-client.types.js';
61
+ import { buildWebSocketUrl } from './utils/url-builder.js';
62
+ import { AudioRingBuffer } from './utils/audio-ring-buffer.js';
63
+ import { MessageHandler } from './utils/message-handler.js';
64
+
65
+ // ============================================================================
66
+ // UTILITIES
67
+ // ============================================================================
68
+
69
+ /**
70
+ * Check if a WebSocket close code indicates normal closure
71
+ * @param code - WebSocket close code
72
+ * @returns true if the disconnection was normal/expected, false if it was an error
73
+ */
74
+ export function isNormalDisconnection(code: number): boolean {
75
+ return code === 1000; // 1000 is the only "normal" close code
76
+ }
77
+
78
+ // ============================================================================
79
+ // TYPE DEFINITIONS
80
+ // ============================================================================
81
+
82
/**
 * Alias of TranscriptionResultV1, kept so callers written against the
 * pre-versioned name `TranscriptionResult` continue to compile.
 */
export type TranscriptionResult = TranscriptionResultV1;

// Re-export the public config interface from the types file so existing
// imports from this module keep working (backward compatibility).
export type { RealTimeTwoWayWebSocketRecognitionClientConfig } from './recognition-client.types.js';
89
+
90
/**
 * Internal config with processed values and defaults applied.
 *
 * Unlike the public config, every callback is guaranteed non-null (the
 * constructor substitutes no-op functions) and the URL is the fully-built
 * WebSocket URL including query parameters.
 */
interface InternalConfig {
  url: string; // Fully-built WebSocket URL (query params already appended)
  readonly audioUtteranceId: string; // Immutable - ensures one audio session per client instance
  asrRequestConfig?: ASRRequestConfig; // Sent as an ASRRequest on connect, when present
  gameContext?: GameContextV1; // Sent to the server right after the ASRRequest, when present
  callbackUrls?: RecognitionCallbackUrl[]; // Forwarded as URL query parameters
  onTranscript: (result: TranscriptionResultV1) => void; // Interim/final transcript callback
  onFunctionCall: (result: FunctionCallResultV1) => void; // Function-call result callback
  onMetadata: (metadata: MetadataResultV1) => void; // Metadata result callback
  onError: (error: ErrorResultV1) => void; // Error result callback
  onConnected: () => void; // Fired after the socket opens and handshake messages are sent
  onDisconnected: (code: number, reason: string) => void; // Fired on every socket close
  highWaterMark: number; // Outbound-buffer size (bytes) at which backpressure engages
  lowWaterMark: number; // Outbound-buffer size (bytes) at which backpressure releases
  maxBufferDurationSec: number; // Ring buffer capacity, in seconds of audio
  chunksPerSecond: number; // Expected chunk rate; sizes the ring buffer with the above
  logger?: (level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any) => void;
}
111
+
112
+ // ============================================================================
113
+ // RECOGNITION CLIENT
114
+ // ============================================================================
115
+
116
+ /**
117
+ * RealTimeTwoWayWebSocketRecognitionClient - SDK-level client for real-time speech recognition
118
+ *
119
+ * Implements IRecognitionClient interface for dependency injection and testing.
120
+ * Extends WebSocketAudioClient with local audio buffering and simple callback-based API.
121
+ */
122
export class RealTimeTwoWayWebSocketRecognitionClient
  extends WebSocketAudioClient<number, any, any>
  implements IRecognitionClient
{
  // Wire protocol version stamped on every outgoing message envelope.
  private static readonly PROTOCOL_VERSION = 1;

  // Processed configuration with defaults and no-op callbacks applied.
  private config: InternalConfig;
  // Fixed-size ring buffer holding audio until the server signals READY.
  private audioBuffer: AudioRingBuffer;
  // Parses incoming server messages and dispatches to configured callbacks.
  private messageHandler: MessageHandler;
  // Lifecycle state machine:
  // INITIAL → CONNECTING → CONNECTED → READY → STOPPING → STOPPED, or FAILED.
  private state: ClientState = ClientState.INITIAL;
  // Shared promise so simultaneous connect() calls coalesce into one attempt.
  private connectionPromise: Promise<void> | undefined;

  // Debug control (internal state, controlled by debugCommand in ASRRequest)
  private isDebugLogEnabled = false;

  // Stats
  private audioBytesSent = 0; // Total payload bytes sent this session
  private audioChunksSent = 0; // Total chunks sent this session
  private audioStatsLogInterval = 100; // Emit a stats log every N chunks (debug only)
  private lastAudioStatsLog = 0; // Chunk count at the time of the last stats log
142
+
143
  /**
   * Build a recognition client.
   *
   * Resolves the audioUtteranceId (generating a UUID v4 when absent),
   * constructs the WebSocket URL from the identity/session fields, wires
   * the base WebSocketAudioClient, and prepares the ring buffer and
   * message handler. No network activity happens here — call connect().
   *
   * @param config - Public configuration; all callbacks are optional and
   *   default to no-ops so callers only wire what they need.
   */
  constructor(config: RealTimeTwoWayWebSocketRecognitionClientConfig) {
    // Generate UUID v4 for audioUtteranceId if not provided
    const audioUtteranceId = config.audioUtteranceId || uuidv4();

    // Build WebSocket URL with query parameters. The conditional spreads
    // keep absent optional fields out of the URL entirely.
    const url = buildWebSocketUrl({
      audioUtteranceId,
      ...(config.url && { url: config.url }),
      ...(config.callbackUrls && { callbackUrls: config.callbackUrls }),
      ...(config.userId && { userId: config.userId }),
      ...(config.gameSessionId && { gameSessionId: config.gameSessionId }),
      ...(config.deviceId && { deviceId: config.deviceId }),
      ...(config.accountId && { accountId: config.accountId }),
      ...(config.questionAnswerId && { questionAnswerId: config.questionAnswerId }),
      ...(config.platform && { platform: config.platform }),
      ...(config.gameContext && { gameContext: config.gameContext })
    });

    // Initialize base WebSocketAudioClient (must run before `this` is used).
    super({
      url: url,
      highWM: config.highWaterMark ?? 512_000,
      lowWM: config.lowWaterMark ?? 128_000
    });

    // Process config with defaults; every callback becomes non-null.
    this.config = {
      url,
      audioUtteranceId,
      ...(config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig }),
      ...(config.gameContext && { gameContext: config.gameContext }),
      ...(config.callbackUrls && { callbackUrls: config.callbackUrls }),
      onTranscript: config.onTranscript || (() => {}),
      onFunctionCall: config.onFunctionCall || (() => {}),
      onMetadata: config.onMetadata || (() => {}),
      onError: config.onError || (() => {}),
      onConnected: config.onConnected || (() => {}),
      onDisconnected: config.onDisconnected || (() => {}),
      highWaterMark: config.highWaterMark ?? 512_000,
      lowWaterMark: config.lowWaterMark ?? 128_000,
      maxBufferDurationSec: config.maxBufferDurationSec ?? 60,
      chunksPerSecond: config.chunksPerSecond ?? 100,
      ...(config.logger && { logger: config.logger })
    };

    // Initialize audio buffer sized for maxBufferDurationSec * chunksPerSecond chunks.
    this.audioBuffer = new AudioRingBuffer({
      maxBufferDurationSec: this.config.maxBufferDurationSec,
      chunksPerSecond: this.config.chunksPerSecond,
      ...(this.config.logger && { logger: this.config.logger })
    });

    // Initialize message handler; control messages route back into this class.
    this.messageHandler = new MessageHandler({
      onTranscript: this.config.onTranscript,
      onFunctionCall: this.config.onFunctionCall,
      onMetadata: this.config.onMetadata,
      onError: this.config.onError,
      onControlMessage: this.handleControlMessage.bind(this),
      ...(this.config.logger && { logger: this.config.logger })
    });
  }
205
+
206
+ // ==========================================================================
207
+ // PRIVATE HELPERS
208
+ // ==========================================================================
209
+
210
+ /**
211
+ * Internal logging helper - only logs if a logger was provided in config
212
+ * Debug logs are additionally gated by isDebugLogEnabled flag
213
+ * @param level - Log level: debug, info, warn, or error
214
+ * @param message - Message to log
215
+ * @param data - Optional additional data to log
216
+ */
217
+ private log(level: 'debug' | 'info' | 'warn' | 'error', message: string, data?: any): void {
218
+ // Skip debug logs if debug logging is not enabled
219
+ if (level === 'debug' && !this.isDebugLogEnabled) {
220
+ return;
221
+ }
222
+
223
+ if (this.config.logger) {
224
+ this.config.logger(level, `[SDK] ${message}`, data);
225
+ }
226
+ }
227
+
228
  /**
   * Clean up internal resources to free memory.
   * Called from onDisconnected() when the connection closes (normally or
   * abnormally), so a subsequent connect() starts from a clean slate.
   */
  private cleanup(): void {
    this.log('debug', 'Cleaning up resources');

    // Clear audio buffer to free memory (any unsent audio is discarded).
    this.audioBuffer.clear();

    // Reset per-session send statistics.
    this.audioBytesSent = 0;
    this.audioChunksSent = 0;
    this.lastAudioStatsLog = 0;

    // Clear connection promise so new connections can be made.
    this.connectionPromise = undefined;
  }
246
+
247
+ // ==========================================================================
248
+ // PUBLIC API
249
+ // ==========================================================================
250
+
251
  /**
   * Open the WebSocket connection.
   *
   * Idempotent under concurrency: simultaneous calls share one promise, and
   * calls made while already connected resolve immediately. A new attempt
   * is only made from the INITIAL, FAILED, or STOPPED states. The attempt
   * fails with Error('Timeout') if the socket does not open within 10s.
   *
   * NOTE(review): this method reassigns this.onConnected / this.onError,
   * wrapping the previous handlers. On a reconnect after FAILED/STOPPED the
   * previous wrappers become the "original" and are wrapped again — the
   * chain grows with each retry. Confirm whether repeated connect() cycles
   * are expected and, if so, whether the wrappers should be restored.
   *
   * @returns Promise that resolves once the socket is open (CONNECTED).
   */
  override async connect(): Promise<void> {
    // FIRST: Check if we already have a connection promise (handles simultaneous calls)
    if (this.connectionPromise) {
      this.log('debug', 'Returning existing connection promise', {
        state: this.state,
        hasPromise: true
      });
      return this.connectionPromise;
    }

    // SECOND: Check state machine - prevent connections in wrong states
    if (
      this.state !== ClientState.INITIAL &&
      this.state !== ClientState.FAILED &&
      this.state !== ClientState.STOPPED
    ) {
      this.log('debug', 'Already connected or in wrong state', {
        state: this.state
      });
      // If we're already connected/ready, return resolved promise
      return Promise.resolve();
    }

    // RETRY HINT: Wrap this method with exponential backoff (e.g., 1s, 2s, 4s) on FAILED state
    // Ensure audioBuffer persists between retries - same audioUtteranceId = same audio session

    this.log('debug', 'Creating new connection to WebSocket', { url: this.config.url });
    this.state = ClientState.CONNECTING;

    const connectionStartTime = Date.now();

    // Store the promise IMMEDIATELY so simultaneous calls will get the same promise
    this.connectionPromise = new Promise((resolve, reject) => {
      // Fail the attempt if the socket does not open within 10 seconds.
      const timeout = setTimeout(() => {
        this.log('warn', 'Connection timeout', { timeout: 10000 });
        this.state = ClientState.FAILED;
        reject(new Error('Timeout'));
      }, 10000);

      // Wrap the onConnected hook so this promise resolves when the base
      // client reports the socket is open; the original hook still runs.
      const originalOnConnected = this.onConnected.bind(this);
      this.onConnected = (): void => {
        clearTimeout(timeout);
        const connectionTime = Date.now() - connectionStartTime;
        this.log('debug', 'Connection established successfully', {
          connectionTimeMs: connectionTime,
          url: this.config.url
        });
        this.state = ClientState.CONNECTED;
        originalOnConnected();
        resolve();
      };

      // Wrap the onError hook so connection-phase errors reject the promise.
      const originalOnError = this.onError.bind(this);
      this.onError = (error): void => {
        clearTimeout(timeout);
        this.log('warn', 'Connection error', error);
        this.state = ClientState.FAILED;
        originalOnError(error);
        reject(error);
      };

      super.connect();
    });

    return this.connectionPromise;
  }
317
+
318
+ override sendAudio(audioData: ArrayBuffer | ArrayBufferView): void {
319
+ const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
320
+ if (bytes === 0) return;
321
+
322
+ // BACKPRESSURE HINT: Return false or throw if audioBuffer.write() returns false (overflow)
323
+ // Caller should pause audio capture until buffer has space (check isBufferOverflowing())
324
+
325
+ // Always write to ring buffer
326
+ this.audioBuffer.write(audioData);
327
+
328
+ // Send immediately if ready and not backpressured
329
+ if (this.state === ClientState.READY && !super.isLocalBackpressured()) {
330
+ this.log('debug', 'Sending audio immediately', { bytes });
331
+ this.sendAudioNow(audioData);
332
+ this.audioBuffer.read(); // Remove from buffer since we sent it
333
+ } else {
334
+ this.log('debug', 'Buffering audio', {
335
+ bytes,
336
+ state: this.state,
337
+ backpressured: super.isLocalBackpressured()
338
+ });
339
+ }
340
+
341
+ // Log audio stats periodically (only if debug logging is enabled)
342
+ if (this.isDebugLogEnabled) {
343
+ const totalChunks = this.audioChunksSent + this.audioBuffer.getStats().chunksBuffered;
344
+ if (totalChunks - this.lastAudioStatsLog >= this.audioStatsLogInterval) {
345
+ const stats = this.audioBuffer.getStats();
346
+ this.log('debug', 'Audio statistics', {
347
+ totalBytesSent: this.audioBytesSent,
348
+ totalChunksSent: this.audioChunksSent,
349
+ ...stats
350
+ });
351
+ this.lastAudioStatsLog = totalChunks;
352
+ }
353
+ }
354
+ }
355
+
356
  /**
   * Signal end of audio and wait for final results.
   *
   * Sends a STOP_RECORDING control signal, then resolves when a transcript
   * with is_finished arrives, or after a 5-second timeout — in either case
   * the state becomes STOPPED. Only valid from the READY state; otherwise
   * it logs a warning and returns immediately.
   *
   * NOTE(review): this reaches into MessageHandler internals via `as any`
   * to swap the onTranscript callback — consider exposing a setter on
   * MessageHandler instead. Also note the wrapped callback is never
   * restored afterward.
   */
  async stopRecording(): Promise<void> {
    if (this.state !== ClientState.READY) {
      this.log('warn', 'Cannot stop recording - not in READY state', { state: this.state });
      return;
    }

    this.log('debug', 'Stopping recording');
    this.state = ClientState.STOPPING;

    // Tell the server no more audio is coming.
    super.sendMessage(RealTimeTwoWayWebSocketRecognitionClient.PROTOCOL_VERSION, 'message', {
      type: RecognitionContextTypeV1.CONTROL_SIGNAL,
      signal: ControlSignalTypeV1.STOP_RECORDING
    });

    return new Promise((resolve) => {
      // Don't hang forever if the final transcript never arrives.
      const timeout = setTimeout(() => {
        this.state = ClientState.STOPPED;
        resolve();
      }, 5000);

      // Wrap the transcript callback so we can detect the final result
      // while still forwarding every transcript to the user's callback.
      const original = this.config.onTranscript;
      this.config.onTranscript = (result): void => {
        original(result);
        if (result.is_finished) {
          clearTimeout(timeout);
          this.state = ClientState.STOPPED;
          resolve();
        }
      };

      // CRITICAL: Update MessageHandler's callback to use the wrapped version
      // Otherwise it will keep calling the original and never detect is_finished
      (this.messageHandler as any).callbacks.onTranscript = this.config.onTranscript;
    });
  }
391
+
392
+
393
  /** @returns The immutable audio-session id fixed for this client's lifetime. */
  getAudioUtteranceId(): string {
    return this.config.audioUtteranceId;
  }

  /** @returns The client's current lifecycle state. */
  getState(): ClientState {
    return this.state;
  }

  /** @returns true once the server has signaled it is ready for audio upload (READY). */
  isConnected(): boolean {
    return this.state === ClientState.READY;
  }

  /** @returns true while the initial WebSocket connection is being established. */
  isConnecting(): boolean {
    return this.state === ClientState.CONNECTING;
  }

  /** @returns true after stopRecording() was issued but before finals (or timeout). */
  isStopping(): boolean {
    return this.state === ClientState.STOPPING;
  }

  /** @returns true once the session has stopped (final transcript received or timed out). */
  isTranscriptionFinished(): boolean {
    return this.state === ClientState.STOPPED;
  }

  /** @returns The ring buffer's overflow status (see AudioRingBuffer.isOverflowing). */
  isBufferOverflowing(): boolean {
    return this.audioBuffer.isOverflowing();
  }
420
+
421
+ getStats(): IRecognitionClientStats {
422
+ const bufferStats = this.audioBuffer.getStats();
423
+ return {
424
+ audioBytesSent: this.audioBytesSent,
425
+ audioChunksSent: this.audioChunksSent,
426
+ audioChunksBuffered: bufferStats.chunksBuffered,
427
+ bufferOverflowCount: bufferStats.overflowCount,
428
+ currentBufferedChunks: bufferStats.currentBufferedChunks,
429
+ hasWrapped: bufferStats.hasWrapped
430
+ };
431
+ }
432
+
433
+ // ==========================================================================
434
+ // WEBSOCKET HOOKS (from WebSocketAudioClient)
435
+ // ==========================================================================
436
+
437
  /**
   * Socket-open hook from WebSocketAudioClient.
   *
   * Performs the application-level handshake: sends the ASRRequest (when an
   * asrRequestConfig was supplied) and then the GameContext (when supplied),
   * then fires the user's onConnected callback. Audio upload does not start
   * here — the client waits for the server's READY control message.
   */
  protected onConnected(): void {
    this.log('debug', 'WebSocket onConnected callback');

    // Send ASRRequest with configuration (if provided)
    if (this.config.asrRequestConfig) {
      // Extract debugCommand if present (with type safety for new field)
      const debugCommand = (this.config.asrRequestConfig as any).debugCommand;
      if (debugCommand?.enableDebugLog) {
        this.isDebugLogEnabled = true;
        this.log('debug', 'Debug logging enabled via debugCommand');
      }

      // Only generate debug log data if debug logging is enabled
      if (this.isDebugLogEnabled) {
        this.log('debug', 'Sending ASR request', this.config.asrRequestConfig);
      }

      // Normalize the config into the wire-format ASRRequestV1, applying
      // defaults: language 'en', 16 kHz sample rate, LINEAR16 encoding,
      // no interim results.
      const asrRequest: ASRRequestV1 = {
        type: RecognitionContextTypeV1.ASR_REQUEST,
        audioUtteranceId: this.config.audioUtteranceId,
        provider: this.config.asrRequestConfig.provider.toString(),
        model: this.config.asrRequestConfig.model,
        language: this.config.asrRequestConfig.language?.toString() || 'en',
        sampleRate:
          typeof this.config.asrRequestConfig.sampleRate === 'number'
            ? this.config.asrRequestConfig.sampleRate
            : SampleRate.RATE_16000,
        encoding:
          typeof this.config.asrRequestConfig.encoding === 'number'
            ? this.config.asrRequestConfig.encoding
            : AudioEncoding.LINEAR16,
        interimResults: this.config.asrRequestConfig.interimResults ?? false,
        // Auto-enable useContext if gameContext is provided, or use explicit value if set
        useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
        ...(debugCommand && { debugCommand })
      };

      super.sendMessage(
        RealTimeTwoWayWebSocketRecognitionClient.PROTOCOL_VERSION,
        'message',
        asrRequest
      );
    }

    // Send GameContext if provided
    if (this.config.gameContext) {
      // Only pass gameContext object to log if debug logging is enabled
      if (this.isDebugLogEnabled) {
        this.log('debug', 'Sending game context', this.config.gameContext);
      }
      super.sendMessage(
        RealTimeTwoWayWebSocketRecognitionClient.PROTOCOL_VERSION,
        'message',
        this.config.gameContext
      );
    }

    this.log('debug', 'Waiting for server ready signal');
    this.config.onConnected();
  }
497
+
498
+ protected onDisconnected(code: number, reason: string): void {
499
+ this.log('debug', 'WebSocket disconnected', { code, reason, previousState: this.state });
500
+
501
+ // Update state based on disconnection type
502
+ if (this.state === ClientState.STOPPING) {
503
+ this.state = ClientState.STOPPED;
504
+ } else if (
505
+ this.state === ClientState.CONNECTED ||
506
+ this.state === ClientState.READY ||
507
+ this.state === ClientState.CONNECTING
508
+ ) {
509
+ this.log('error', 'Unexpected disconnection', { code, reason });
510
+ this.state = ClientState.FAILED;
511
+ }
512
+
513
+ // Clean up memory proactively when connection closes
514
+ this.cleanup();
515
+
516
+ this.config.onDisconnected(code, reason);
517
+ }
518
+
519
+ protected onError(error: Event): void {
520
+ this.state = ClientState.FAILED;
521
+
522
+ const errorResult: ErrorResultV1 = {
523
+ type: RecognitionResultTypeV1.ERROR,
524
+ audioUtteranceId: '',
525
+ message: 'WebSocket error',
526
+ description: error.type || 'Connection error'
527
+ };
528
+ this.config.onError(errorResult);
529
+ }
530
+
531
  /**
   * Raw message hook from WebSocketAudioClient. All envelope parsing and
   * dispatch (transcripts, function calls, metadata, errors, control
   * messages) is delegated to the MessageHandler.
   *
   * @param msg - Versioned message envelope from the server
   */
  protected override onMessage(msg: { v: number; type: string; data: any }): void {
    this.messageHandler.handleMessage(msg);
  }
534
+
535
+ // ==========================================================================
536
+ // INTERNAL HELPERS
537
+ // ==========================================================================
538
+
539
  /**
   * Handle control messages from the server.
   *
   * READY_FOR_UPLOADING_RECORDING moves the client to READY, stamps the
   * session start time on the message handler, and flushes any audio that
   * accumulated in the ring buffer while waiting. STOP_RECORDING from the
   * server is currently only logged. Unknown actions are logged as warnings.
   *
   * @param msg - Control message containing the server action
   */
  private handleControlMessage(msg: ClientControlMessageV1): void {
    switch (msg.action) {
      case ClientControlActionV1.READY_FOR_UPLOADING_RECORDING: {
        this.log('debug', 'Server ready for audio upload');
        this.state = ClientState.READY;
        this.messageHandler.setSessionStartTime(Date.now());

        // Flush buffered audio now that server is ready. State must be set
        // to READY first so subsequent sendAudio() calls go straight out.
        const bufferedChunks = this.audioBuffer.flush();
        if (bufferedChunks.length > 0) {
          this.log('debug', 'Flushing buffered audio', { chunks: bufferedChunks.length });
          bufferedChunks.forEach((chunk) => this.sendAudioNow(chunk.data));
        }
        break;
      }

      case ClientControlActionV1.STOP_RECORDING:
        this.log('debug', 'Received stop recording signal from server');
        break;

      default:
        this.log('warn', 'Unknown control action', { action: msg.action });
    }
  }
567
+
568
+ /**
569
+ * Send audio immediately to the server (without buffering)
570
+ * @param audioData - Audio data to send
571
+ */
572
+ private sendAudioNow(audioData: ArrayBuffer | ArrayBufferView): void {
573
+ const byteLength = ArrayBuffer.isView(audioData)
574
+ ? audioData.byteLength
575
+ : audioData.byteLength;
576
+
577
+ const encodingId = (this.config.asrRequestConfig?.encoding ||
578
+ AudioEncoding.LINEAR16) as AudioEncoding;
579
+
580
+ const sampleRate =
581
+ typeof this.config.asrRequestConfig?.sampleRate === 'number'
582
+ ? this.config.asrRequestConfig.sampleRate
583
+ : SampleRate.RATE_16000;
584
+
585
+ super.sendAudio(
586
+ audioData,
587
+ RealTimeTwoWayWebSocketRecognitionClient.PROTOCOL_VERSION,
588
+ encodingId,
589
+ sampleRate
590
+ );
591
+
592
+ this.audioBytesSent += byteLength;
593
+ this.audioChunksSent++;
594
+ }
595
+ }