@volley/recognition-client-sdk 0.1.782 → 0.1.800
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +75 -4
- package/dist/index.bundled.d.ts +198 -87
- package/dist/index.js +191 -20
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +95 -4
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +32 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +22 -85
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/dist/vgf-recognition-mapper.d.ts +9 -17
- package/dist/vgf-recognition-mapper.d.ts.map +1 -1
- package/dist/vgf-recognition-state.d.ts +103 -0
- package/dist/vgf-recognition-state.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.spec.ts +2 -0
- package/src/recognition-client.ts +65 -7
- package/src/recognition-client.types.ts +37 -0
- package/src/simplified-vgf-recognition-client.spec.ts +0 -27
- package/src/simplified-vgf-recognition-client.ts +97 -127
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
- package/src/vgf-recognition-mapper.spec.ts +143 -0
- package/src/vgf-recognition-mapper.ts +35 -45
- package/src/vgf-recognition-state.ts +19 -1
|
@@ -223,6 +223,11 @@ export interface IRecognitionClientConfig {
|
|
|
223
223
|
*
|
|
224
224
|
* Main interface for real-time speech recognition clients.
|
|
225
225
|
* Provides methods for connection management, audio streaming, and session control.
|
|
226
|
+
*
|
|
227
|
+
* NOTE for maintainers: `ISimplifiedVGFRecognitionClient` extends this interface,
|
|
228
|
+
* so any method added here must also be implemented (typically as a delegate) by
|
|
229
|
+
* `SimplifiedVGFRecognitionClient`. TypeScript will flag missing delegates at
|
|
230
|
+
* compile time — do not work around the error, add the delegate.
|
|
226
231
|
*/
|
|
227
232
|
export interface IRecognitionClient {
|
|
228
233
|
/**
|
|
@@ -239,6 +244,38 @@ export interface IRecognitionClient {
|
|
|
239
244
|
*/
|
|
240
245
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
241
246
|
|
|
247
|
+
/**
|
|
248
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
249
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
250
|
+
* server validator) before transmitting.
|
|
251
|
+
*
|
|
252
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
253
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
254
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
255
|
+
* skip the resample step.
|
|
256
|
+
*
|
|
257
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
258
|
+
* mixed to mono by the caller.
|
|
259
|
+
*
|
|
260
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
261
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
262
|
+
*/
|
|
263
|
+
sendAudioWithSampleRate(
|
|
264
|
+
audioData: ArrayBuffer | ArrayBufferView | Blob,
|
|
265
|
+
sourceSampleRate: number
|
|
266
|
+
): void;
|
|
267
|
+
|
|
268
|
+
/**
|
|
269
|
+
* Send prefix audio (e.g. a TTS prompt) that primes the provider's language
|
|
270
|
+
* model before user audio is streamed. Chunks accepted — the server buffers
|
|
271
|
+
* until the session is READY and flushes. Must be sent BEFORE the first
|
|
272
|
+
* `sendAudio()` to take effect. Only meaningful when
|
|
273
|
+
* `asrRequestConfig.prefixMode === PrefixMode.CLIENT`.
|
|
274
|
+
*
|
|
275
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
276
|
+
*/
|
|
277
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
278
|
+
|
|
242
279
|
/**
|
|
243
280
|
* Stop recording and wait for final transcript
|
|
244
281
|
* The server will close the connection after sending the final transcript.
|
|
@@ -766,33 +766,6 @@ describe('SimplifiedVGFRecognitionClient', () => {
|
|
|
766
766
|
});
|
|
767
767
|
});
|
|
768
768
|
|
|
769
|
-
it('should warn if promptSlotMap exists but no gameContext provided', () => {
|
|
770
|
-
const logger = jest.fn();
|
|
771
|
-
const initialState: RecognitionState = {
|
|
772
|
-
audioUtteranceId: 'test-123',
|
|
773
|
-
pendingTranscript: '', // Required field
|
|
774
|
-
promptSlotMap: {
|
|
775
|
-
'entity1': ['value1']
|
|
776
|
-
}
|
|
777
|
-
};
|
|
778
|
-
|
|
779
|
-
simplifiedClient = new SimplifiedVGFRecognitionClient({
|
|
780
|
-
asrRequestConfig: {
|
|
781
|
-
provider: 'deepgram',
|
|
782
|
-
language: 'en',
|
|
783
|
-
sampleRate: 16000,
|
|
784
|
-
encoding: AudioEncoding.LINEAR16
|
|
785
|
-
},
|
|
786
|
-
initialState,
|
|
787
|
-
logger
|
|
788
|
-
});
|
|
789
|
-
|
|
790
|
-
expect(logger).toHaveBeenCalledWith(
|
|
791
|
-
'warn',
|
|
792
|
-
'[VGF] promptSlotMap found but no gameContext provided. SlotMap will not be sent.'
|
|
793
|
-
);
|
|
794
|
-
});
|
|
795
|
-
|
|
796
769
|
it('should preserve promptSlotMap throughout state changes', () => {
|
|
797
770
|
const initialState: RecognitionState = {
|
|
798
771
|
audioUtteranceId: 'test-123',
|
|
@@ -17,17 +17,19 @@ import {
|
|
|
17
17
|
import {
|
|
18
18
|
IRecognitionClient,
|
|
19
19
|
IRecognitionClientConfig,
|
|
20
|
+
IRecognitionClientStats,
|
|
20
21
|
ClientState
|
|
21
22
|
} from './recognition-client.types.js';
|
|
22
23
|
import { RealTimeTwoWayWebSocketRecognitionClient } from './recognition-client.js';
|
|
23
24
|
import {
|
|
24
25
|
createVGFStateFromConfig,
|
|
25
26
|
mapTranscriptionResultToState,
|
|
27
|
+
mapSessionConfiguredToState,
|
|
26
28
|
mapErrorToState,
|
|
27
29
|
updateStateOnStop,
|
|
28
30
|
resetRecognitionVGFState
|
|
29
31
|
} from './vgf-recognition-mapper.js';
|
|
30
|
-
import {
|
|
32
|
+
import type { GameContextV1 } from '@recog/shared-types';
|
|
31
33
|
|
|
32
34
|
/**
|
|
33
35
|
* Configuration for SimplifiedVGFRecognitionClient
|
|
@@ -49,114 +51,22 @@ export interface SimplifiedVGFClientConfig extends IRecognitionClientConfig {
|
|
|
49
51
|
/**
|
|
50
52
|
* Interface for SimplifiedVGFRecognitionClient
|
|
51
53
|
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
54
|
+
* Inherits the full IRecognitionClient surface (connect, sendAudio,
|
|
55
|
+
* sendAudioWithSampleRate, sendPrefixAudio, stopRecording, stopAbnormally,
|
|
56
|
+
* status checks, sendGameContext, getStats, getUrl, getState, getAudioUtteranceId)
|
|
57
|
+
* — see recognition-client.types.ts for those. Adds VGF-specific state access.
|
|
58
|
+
*
|
|
59
|
+
* Extending IRecognitionClient (rather than redeclaring methods) means
|
|
60
|
+
* TypeScript catches any base-client method that's not delegated by the
|
|
61
|
+
* VGF wrapper at compile time — keeps the two surfaces in sync.
|
|
54
62
|
*/
|
|
55
|
-
export interface ISimplifiedVGFRecognitionClient {
|
|
56
|
-
// ============= Core Connection Methods =============
|
|
57
|
-
/**
|
|
58
|
-
* Connect to the recognition service WebSocket
|
|
59
|
-
* @returns Promise that resolves when connected and ready
|
|
60
|
-
*/
|
|
61
|
-
connect(): Promise<void>;
|
|
62
|
-
|
|
63
|
-
/**
|
|
64
|
-
* Send audio data for transcription
|
|
65
|
-
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
66
|
-
*/
|
|
67
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
68
|
-
|
|
69
|
-
/**
|
|
70
|
-
* Stop recording and wait for final transcription
|
|
71
|
-
* @returns Promise that resolves when transcription is complete
|
|
72
|
-
*/
|
|
73
|
-
stopRecording(): Promise<void>;
|
|
74
|
-
|
|
63
|
+
export interface ISimplifiedVGFRecognitionClient extends IRecognitionClient {
|
|
75
64
|
/**
|
|
76
|
-
*
|
|
77
|
-
*
|
|
78
|
-
* WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
|
|
79
|
-
* - Does NOT wait for server to process remaining audio
|
|
80
|
-
* - Does NOT receive final transcript from server (VGF state set to empty)
|
|
81
|
-
* - Immediately closes WebSocket connection
|
|
82
|
-
* - Cleans up resources (buffers, listeners)
|
|
83
|
-
*
|
|
84
|
-
* Use Cases:
|
|
85
|
-
* - User explicitly cancels/abandons the session
|
|
86
|
-
* - Timeout scenarios where waiting is not acceptable
|
|
87
|
-
* - Need immediate cleanup and can't wait for server
|
|
88
|
-
*
|
|
89
|
-
* RECOMMENDED: Use stopRecording() for normal shutdown.
|
|
90
|
-
* Only use this when immediate disconnection is required.
|
|
91
|
-
*/
|
|
92
|
-
stopAbnormally(): void;
|
|
93
|
-
|
|
94
|
-
// ============= VGF State Methods =============
|
|
95
|
-
/**
|
|
96
|
-
* Get the current VGF recognition state
|
|
65
|
+
* Get the current VGF recognition state — the single shared store
|
|
66
|
+
* of inputs and outputs for this utterance.
|
|
97
67
|
* @returns Current RecognitionState with all transcription data
|
|
98
68
|
*/
|
|
99
69
|
getVGFState(): RecognitionState;
|
|
100
|
-
|
|
101
|
-
// ============= Status Check Methods =============
|
|
102
|
-
/**
|
|
103
|
-
* Check if connected to the WebSocket
|
|
104
|
-
*/
|
|
105
|
-
isConnected(): boolean;
|
|
106
|
-
|
|
107
|
-
/**
|
|
108
|
-
* Check if currently connecting
|
|
109
|
-
*/
|
|
110
|
-
isConnecting(): boolean;
|
|
111
|
-
|
|
112
|
-
/**
|
|
113
|
-
* Check if currently stopping
|
|
114
|
-
*/
|
|
115
|
-
isStopping(): boolean;
|
|
116
|
-
|
|
117
|
-
/**
|
|
118
|
-
* Check if transcription has finished
|
|
119
|
-
*/
|
|
120
|
-
isTranscriptionFinished(): boolean;
|
|
121
|
-
|
|
122
|
-
/**
|
|
123
|
-
* Check if the audio buffer has overflowed
|
|
124
|
-
*/
|
|
125
|
-
isBufferOverflowing(): boolean;
|
|
126
|
-
|
|
127
|
-
// ============= Preconnect Methods =============
|
|
128
|
-
/**
|
|
129
|
-
* Send game context after connection is established (for preconnect flow).
|
|
130
|
-
*
|
|
131
|
-
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
132
|
-
* WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
|
|
133
|
-
*
|
|
134
|
-
* @param context - Game context including slotMap for keyword boosting
|
|
135
|
-
*/
|
|
136
|
-
sendGameContext(context: GameContextV1): void;
|
|
137
|
-
|
|
138
|
-
/**
|
|
139
|
-
* Check if server has sent READY signal (provider connected, ready for audio).
|
|
140
|
-
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
141
|
-
*/
|
|
142
|
-
isServerReady(): boolean;
|
|
143
|
-
|
|
144
|
-
// ============= Utility Methods =============
|
|
145
|
-
/**
|
|
146
|
-
* Get the audio utterance ID for this session
|
|
147
|
-
*/
|
|
148
|
-
getAudioUtteranceId(): string;
|
|
149
|
-
|
|
150
|
-
/**
|
|
151
|
-
* Get the WebSocket URL being used
|
|
152
|
-
*/
|
|
153
|
-
getUrl(): string;
|
|
154
|
-
|
|
155
|
-
/**
|
|
156
|
-
* Get the underlying client state (for advanced usage)
|
|
157
|
-
*/
|
|
158
|
-
getState(): ClientState;
|
|
159
|
-
|
|
160
70
|
}
|
|
161
71
|
|
|
162
72
|
/**
|
|
@@ -229,23 +139,38 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
229
139
|
// Track the expected UUID for this session
|
|
230
140
|
this.expectedUuid = this.state.audioUtteranceId;
|
|
231
141
|
|
|
232
|
-
//
|
|
233
|
-
|
|
142
|
+
// Forward optional VGF inputs (promptSlotMap, promptSTT/STF/TTF) into the GameContext.
|
|
143
|
+
const hasPromptInputs =
|
|
144
|
+
this.state.promptSlotMap !== undefined ||
|
|
145
|
+
this.state.promptSTT !== undefined ||
|
|
146
|
+
this.state.promptSTF !== undefined ||
|
|
147
|
+
this.state.promptTTF !== undefined;
|
|
148
|
+
|
|
149
|
+
if (hasPromptInputs) {
|
|
234
150
|
// Set useContext=true in ASR config to enable context processing
|
|
235
151
|
if (clientConfig.asrRequestConfig) {
|
|
236
152
|
clientConfig.asrRequestConfig.useContext = true;
|
|
237
153
|
}
|
|
238
154
|
|
|
239
|
-
// Add promptSlotMap to gameContext
|
|
240
155
|
if (!clientConfig.gameContext) {
|
|
241
156
|
// Only create gameContext if we have gameId and gamePhase
|
|
242
157
|
// These should come from the game's configuration
|
|
243
158
|
if (clientConfig.logger) {
|
|
244
|
-
clientConfig.logger('warn', '[VGF]
|
|
159
|
+
clientConfig.logger('warn', '[VGF] prompt inputs found but no gameContext provided. They will not be sent.');
|
|
245
160
|
}
|
|
246
161
|
} else {
|
|
247
|
-
|
|
248
|
-
|
|
162
|
+
if (this.state.promptSlotMap !== undefined) {
|
|
163
|
+
clientConfig.gameContext.slotMap = this.state.promptSlotMap;
|
|
164
|
+
}
|
|
165
|
+
if (this.state.promptSTT !== undefined) {
|
|
166
|
+
clientConfig.gameContext.promptSTT = this.state.promptSTT;
|
|
167
|
+
}
|
|
168
|
+
if (this.state.promptSTF !== undefined) {
|
|
169
|
+
clientConfig.gameContext.promptSTF = this.state.promptSTF;
|
|
170
|
+
}
|
|
171
|
+
if (this.state.promptTTF !== undefined) {
|
|
172
|
+
clientConfig.gameContext.promptTTF = this.state.promptTTF;
|
|
173
|
+
}
|
|
249
174
|
}
|
|
250
175
|
}
|
|
251
176
|
|
|
@@ -254,7 +179,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
254
179
|
...clientConfig,
|
|
255
180
|
|
|
256
181
|
// These callbacks ONLY update the VGF state sink
|
|
257
|
-
onTranscript: (result) => {
|
|
182
|
+
onTranscript: (result): void => {
|
|
258
183
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
259
184
|
if (result.audioUtteranceId && result.audioUtteranceId !== this.expectedUuid) {
|
|
260
185
|
if (this.logger) {
|
|
@@ -275,7 +200,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
275
200
|
}
|
|
276
201
|
},
|
|
277
202
|
|
|
278
|
-
onMetadata: (metadata) => {
|
|
203
|
+
onMetadata: (metadata): void => {
|
|
279
204
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
280
205
|
if (metadata.audioUtteranceId && metadata.audioUtteranceId !== this.expectedUuid) {
|
|
281
206
|
if (this.logger) {
|
|
@@ -291,14 +216,33 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
291
216
|
}
|
|
292
217
|
},
|
|
293
218
|
|
|
294
|
-
|
|
219
|
+
onSessionConfigured: (sessionConfigured): void => {
|
|
220
|
+
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
221
|
+
if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
|
|
222
|
+
if (this.logger) {
|
|
223
|
+
this.logger('warn',
|
|
224
|
+
`[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
|
|
225
|
+
);
|
|
226
|
+
}
|
|
227
|
+
return;
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
|
|
231
|
+
this.notifyStateChange();
|
|
232
|
+
|
|
233
|
+
if (clientConfig.onSessionConfigured) {
|
|
234
|
+
clientConfig.onSessionConfigured(sessionConfigured);
|
|
235
|
+
}
|
|
236
|
+
},
|
|
237
|
+
|
|
238
|
+
onFunctionCall: (result): void => {
|
|
295
239
|
// Pass through function call - no VGF state changes needed for P2 feature
|
|
296
240
|
if (clientConfig.onFunctionCall) {
|
|
297
241
|
clientConfig.onFunctionCall(result);
|
|
298
242
|
}
|
|
299
243
|
},
|
|
300
244
|
|
|
301
|
-
onError: (error) => {
|
|
245
|
+
onError: (error): void => {
|
|
302
246
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
303
247
|
if (error.audioUtteranceId && error.audioUtteranceId !== this.expectedUuid) {
|
|
304
248
|
if (this.logger) {
|
|
@@ -310,7 +254,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
310
254
|
}
|
|
311
255
|
|
|
312
256
|
this.isRecordingAudio = false; // Reset on error
|
|
313
|
-
this.state = mapErrorToState(this.state
|
|
257
|
+
this.state = mapErrorToState(this.state);
|
|
314
258
|
this.notifyStateChange();
|
|
315
259
|
|
|
316
260
|
if (clientConfig.onError) {
|
|
@@ -318,14 +262,14 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
318
262
|
}
|
|
319
263
|
},
|
|
320
264
|
|
|
321
|
-
onConnected: () => {
|
|
265
|
+
onConnected: (): void => {
|
|
322
266
|
// Don't update READY here - client can accept audio before connection
|
|
323
267
|
if (clientConfig.onConnected) {
|
|
324
268
|
clientConfig.onConnected();
|
|
325
269
|
}
|
|
326
270
|
},
|
|
327
271
|
|
|
328
|
-
onDisconnected: (code, reason) => {
|
|
272
|
+
onDisconnected: (code, reason): void => {
|
|
329
273
|
this.isRecordingAudio = false; // Reset on disconnect
|
|
330
274
|
if (clientConfig.onDisconnected) {
|
|
331
275
|
clientConfig.onDisconnected(code, reason);
|
|
@@ -343,19 +287,45 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
343
287
|
}
|
|
344
288
|
|
|
345
289
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void {
|
|
346
|
-
|
|
347
|
-
if (!this.isRecordingAudio) {
|
|
348
|
-
this.isRecordingAudio = true;
|
|
349
|
-
this.state = {
|
|
350
|
-
...this.state,
|
|
351
|
-
startRecordingStatus: 'RECORDING',
|
|
352
|
-
startRecordingTimestamp: new Date().toISOString()
|
|
353
|
-
};
|
|
354
|
-
this.notifyStateChange();
|
|
355
|
-
}
|
|
290
|
+
this.markRecordingStarted();
|
|
356
291
|
this.client.sendAudio(audioData);
|
|
357
292
|
}
|
|
358
293
|
|
|
294
|
+
/**
 * Forward an audio chunk captured at `sourceSampleRate` to the wrapped client.
 *
 * Calls `markRecordingStarted()` first, so the first chunk sent through this
 * method flips the VGF state to RECORDING exactly like `sendAudio()` does,
 * then delegates to the wrapped client's `sendAudioWithSampleRate()`.
 *
 * @param audioData - Audio chunk to forward (ArrayBuffer, typed-array view, or Blob).
 * @param sourceSampleRate - Sample rate of `audioData` in Hz; passed through unchanged.
 */
sendAudioWithSampleRate(
  audioData: ArrayBuffer | ArrayBufferView | Blob,
  sourceSampleRate: number
): void {
  // Record the RECORDING transition before any audio leaves the wrapper.
  this.markRecordingStarted();
  this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
}
|
|
301
|
+
|
|
302
|
+
/**
 * Forward prefix audio to the wrapped client without touching VGF state.
 *
 * @param audioData - Prefix audio chunk to forward (ArrayBuffer, typed-array view, or Blob).
 */
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void {
  // Pure pass-through. Prefix audio is NOT user audio, so do not flip
  // startRecordingStatus to RECORDING — that transition belongs to the
  // first sendAudio() call.
  this.client.sendPrefixAudio(audioData);
}
|
|
308
|
+
|
|
309
|
+
/**
 * Return the statistics reported by the wrapped recognition client.
 *
 * Pure delegate: the VGF wrapper adds nothing to, and caches nothing from,
 * the value returned by the wrapped client's `getStats()`.
 *
 * @returns The wrapped client's current `IRecognitionClientStats`.
 */
getStats(): IRecognitionClientStats {
  return this.client.getStats();
}
|
|
312
|
+
|
|
313
|
+
/**
 * Transition the VGF state to RECORDING when the first audio chunk arrives.
 *
 * Only the first call after `isRecordingAudio` was last cleared has any
 * effect: it sets the flag, stamps `startRecordingStatus` /
 * `startRecordingTimestamp` on a fresh state object, and notifies state
 * listeners. Every later call is a no-op until the flag is reset elsewhere
 * (e.g. on stop, error, or disconnect).
 */
private markRecordingStarted(): void {
  if (!this.isRecordingAudio) {
    this.isRecordingAudio = true;

    // ISO-8601 wall-clock time of the first chunk.
    const startedAt = new Date().toISOString();
    const recordingState = {
      ...this.state,
      startRecordingStatus: 'RECORDING',
      startRecordingTimestamp: startedAt
    };

    this.state = recordingState;
    this.notifyStateChange();
  }
}
|
|
328
|
+
|
|
359
329
|
async stopRecording(): Promise<void> {
|
|
360
330
|
this.isRecordingAudio = false;
|
|
361
331
|
this.state = updateStateOnStop(this.state);
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { downsamplePcm16 } from './audio-resampler.js';
|
|
2
|
+
|
|
3
|
+
// Test helpers: `buf` packs plain numbers into a fresh PCM16 ArrayBuffer,
// `samples` unpacks an ArrayBuffer of PCM16 back into plain numbers.
function buf(values: number[]): ArrayBuffer {
  return Int16Array.from(values).buffer;
}

function samples(b: ArrayBuffer): number[] {
  const pcm = new Int16Array(b);
  return Array.from(pcm);
}

describe('downsamplePcm16', () => {
  it('returns a defensive copy when srcRate === targetRate', () => {
    const original = [100, 200, 300, 400];
    const input = Int16Array.from(original);

    const out = downsamplePcm16(input, 16000, 16000);
    expect(samples(out)).toEqual(original);

    // Writing to the caller's array afterwards must not leak into the result.
    input[0] = 9999;
    expect(samples(out)[0]).toBe(100);
  });

  it('integer ratio 48000 → 16000 averages every 3 source samples', () => {
    // Ratio 3: nine input samples collapse into three window means.
    const windows = [
      [0, 6, 12], // mean 6
      [30, 60, 90], // mean 60
      [-3, -6, -9], // mean -6
    ];
    const input = buf(windows.flat());

    const out = downsamplePcm16(input, 48000, 16000);

    expect(samples(out)).toEqual([6, 60, -6]);
  });

  it('fractional ratio 44100 → 16000 produces ~44100/16000 output samples', () => {
    // 441 samples at 44.1 kHz is 10 ms of audio, i.e. 160 samples at 16 kHz.
    const sourceLength = 441;
    const input = new Int16Array(sourceLength).fill(1000);

    const outArr = new Int16Array(downsamplePcm16(input, 44100, 16000));

    const expectedLength = Math.floor(sourceLength / (44100 / 16000)); // = 160
    expect(outArr.length).toBe(expectedLength);

    // Averaging a constant signal must give back the same constant.
    for (const value of outArr) {
      expect(value).toBe(1000);
    }
  });

  it('accepts ArrayBuffer input as well as typed-array view', () => {
    const view = Int16Array.from([10, 20, 30, 40, 50, 60]);

    const fromView = samples(downsamplePcm16(view, 48000, 16000));
    const fromBuf = samples(downsamplePcm16(view.buffer, 48000, 16000));

    expect(fromBuf).toEqual(fromView);
  });

  it('handles a typed-array view that shares a larger backing buffer', () => {
    // A window into a bigger capture buffer: the 99 padding samples on either
    // side belong to the backing buffer but not to the view, and must be ignored.
    const big = Int16Array.from([99, 99, 0, 6, 12, 30, 60, 90, 99, 99]);
    const slice = big.subarray(2, 8);

    const out = downsamplePcm16(slice, 48000, 16000);

    expect(samples(out)).toEqual([6, 60]);
  });

  it('returns an empty buffer for empty input', () => {
    const out = downsamplePcm16(new Int16Array(0), 48000, 16000);
    const decoded = new Int16Array(out);
    expect(decoded.length).toBe(0);
  });

  it('throws when asked to upsample', () => {
    const upsample = (): ArrayBuffer => downsamplePcm16(Int16Array.from([1, 2]), 8000, 16000);
    expect(upsample).toThrow(/cannot upsample/i);
  });
});
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
 * Downsample PCM16 mono audio to a target sample rate.
 *
 * Used by `sendAudioWithSampleRate()` so integrators whose capture pipeline
 * produces audio at the system's native rate (AudioContext defaults to
 * 44.1 kHz or 48 kHz on most desktop/mobile hardware) can hand raw bytes
 * to the SDK without having to bring in their own resampler.
 *
 * Algorithm: box-filter averaging. Output sample `i` is the rounded mean of
 * the source samples in `[floor(i * srcRate / targetRate),
 * ceil((i + 1) * srcRate / targetRate))`. The averaging acts as a cheap
 * low-pass filter, and the whole pass is O(n) with no dependencies.
 * For integer ratios (e.g. 48000 → 16000) every window holds exactly
 * `srcRate / targetRate` samples; for fractional ratios (e.g. 44100 → 16000)
 * the window size varies by one sample and adjacent windows may share a
 * boundary sample.
 *
 * Window edges and the output length are computed from the products
 * `i * srcRate` / `length * targetRate` rather than from a pre-divided
 * floating-point ratio, so for integer sample rates they are exact and the
 * final output sample is never lost to rounding error.
 *
 * The input is read through an `Int16Array`, i.e. as signed 16-bit samples in
 * the platform's byte order (little-endian on all mainstream targets). Mono
 * only: stereo audio must be mixed to mono by the caller.
 *
 * @param input - Source PCM16 audio (ArrayBuffer or any ArrayBufferView). For
 *   a view, only the bytes the view covers are read. Never mutated.
 * @param srcRate - Source sample rate in Hz (e.g. 44100, 48000).
 * @param targetRate - Target sample rate in Hz. Must be ≤ srcRate.
 * @returns A new ArrayBuffer of PCM16 samples at `targetRate` holding
 *   `floor(sampleCount * targetRate / srcRate)` samples. When the rates are
 *   equal (or the input is empty) this is a copy of the input samples.
 * @throws Error if either rate is not a positive finite number.
 * @throws Error if `targetRate > srcRate` (upsampling is not supported —
 *   capture at ≥ targetRate instead).
 * @throws RangeError if the input's byte length is odd (not whole PCM16 samples).
 */
export function downsamplePcm16(
  input: ArrayBuffer | ArrayBufferView,
  srcRate: number,
  targetRate: number
): ArrayBuffer {
  // Reject NaN / Infinity / zero / negative rates up front; otherwise they
  // fall through the arithmetic below and silently yield empty or garbage audio.
  if (
    !Number.isFinite(srcRate) ||
    !Number.isFinite(targetRate) ||
    srcRate <= 0 ||
    targetRate <= 0
  ) {
    throw new Error(
      `downsamplePcm16: sample rates must be positive finite numbers ` +
        `(got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }

  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; ` +
        `capture audio at ≥ ${targetRate}Hz instead.`
    );
  }

  // Copy out exactly the bytes the caller handed us. Slicing (rather than
  // re-viewing) keeps a view into a larger buffer from pulling in neighbouring
  // bytes and also tolerates views whose byteOffset is not 2-byte aligned.
  const buffer = ArrayBuffer.isView(input)
    ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength)
    : input;

  // Int16Array would throw a RangeError here anyway; raise the same error type
  // with a message that names the actual problem.
  if (buffer.byteLength % Int16Array.BYTES_PER_ELEMENT !== 0) {
    throw new RangeError(
      `downsamplePcm16: input byte length (${buffer.byteLength}) is not a multiple of 2; ` +
        `PCM16 samples are 2 bytes each.`
    );
  }

  const src = new Int16Array(buffer);

  if (srcRate === targetRate || src.length === 0) {
    // Return a defensive copy so the caller's data can't be mutated through
    // the returned buffer (matters when `input` was a plain ArrayBuffer).
    return src.slice().buffer;
  }

  const dstLen = Math.floor((src.length * targetRate) / srcRate);
  const dst = new Int16Array(dstLen);

  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor((i * srcRate) / targetRate);
    const endIdx = Math.min(Math.ceil(((i + 1) * srcRate) / targetRate), src.length);

    let sum = 0;
    let count = 0;
    for (let j = startIdx; j < endIdx; j++) {
      // In range by construction; `?? 0` only satisfies noUncheckedIndexedAccess.
      sum += src[j] ?? 0;
      count++;
    }
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }

  return dst.buffer;
}
|