@volley/recognition-client-sdk 0.1.782 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +60 -4
- package/dist/index.bundled.d.ts +75 -4
- package/dist/index.js +115 -13
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +95 -4
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +3 -3
- package/src/index.spec.ts +2 -0
- package/src/recognition-client.ts +65 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
|
@@ -89,6 +89,29 @@ export declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketA
|
|
|
89
89
|
*/
|
|
90
90
|
private connectWithRetry;
|
|
91
91
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
92
|
+
/**
|
|
93
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
94
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
95
|
+
* before sending.
|
|
96
|
+
*
|
|
97
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
98
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
99
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
100
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
101
|
+
* `sendAudio()` to skip the resample step.
|
|
102
|
+
*
|
|
103
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
104
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
105
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
106
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
107
|
+
*
|
|
108
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
109
|
+
* mixed to mono by the caller.
|
|
110
|
+
*
|
|
111
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
112
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
113
|
+
*/
|
|
114
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
92
115
|
private sendAudioInternal;
|
|
93
116
|
/**
|
|
94
117
|
* Only active when the client is in READY state; otherwise it will return immediately.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"recognition-client.d.ts","sourceRoot":"","sources":["../src/recognition-client.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AAGH,OAAO,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AACxD,OAAO,EAML,KAAK,qBAAqB,EAS1B,KAAK,aAAa,EAGnB,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EAAE,WAAW,EAAE,MAAM,+BAA+B,CAAC;AAC5D,OAAO,KAAK,EACV,kBAAkB,EAClB,uBAAuB,EACvB,8CAA8C,EAE/C,MAAM,+BAA+B,CAAC;
|
|
1
|
+
{"version":3,"file":"recognition-client.d.ts","sourceRoot":"","sources":["../src/recognition-client.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AAGH,OAAO,EAAE,oBAAoB,EAAE,MAAM,kBAAkB,CAAC;AACxD,OAAO,EAML,KAAK,qBAAqB,EAS1B,KAAK,aAAa,EAGnB,MAAM,qBAAqB,CAAC;AAE7B,OAAO,EAAE,WAAW,EAAE,MAAM,+BAA+B,CAAC;AAC5D,OAAO,KAAK,EACV,kBAAkB,EAClB,uBAAuB,EACvB,8CAA8C,EAE/C,MAAM,+BAA+B,CAAC;AAWvC;;;;GAIG;AACH,wBAAgB,qBAAqB,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAE3D;AAgCD;;GAEG;AACH,MAAM,MAAM,mBAAmB,GAAG,qBAAqB,CAAC;AAGxD,YAAY,EAAE,8CAA8C,EAAE,MAAM,+BAA+B,CAAC;AAkCpG;;;;;GAKG;AACH,qBAAa,wCACX,SAAQ,oBAAoB,CAAC,MAAM,EAAE,GAAG,EAAE,GAAG,CAC7C,YAAW,kBAAkB;IAE7B,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAK;IAC7C,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,uBAAuB,CAAoB;IAEnE,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,WAAW,CAAkB;IACrC,OAAO,CAAC,YAAY,CAAyC;IAC7D,OAAO,CAAC,iBAAiB,CAAK;IAC9B,OAAO,CAAC,cAAc,CAAiB;IACvC,OAAO,CAAC,KAAK,CAAoC;IACjD,OAAO,CAAC,iBAAiB,CAA4B;IAGrD,OAAO,CAAC,iBAAiB,CAAS;IAGlC,OAAO,CAAC,cAAc,CAAK;IAC3B,OAAO,CAAC,eAAe,CAAK;IAC5B,OAAO,CAAC,qBAAqB,CAAO;IACpC,OAAO,CAAC,iBAAiB,CAAK;gBAElB,MAAM,EAAE,8CAA8C;IAkGlE;;;;;;OAMG;IACH,OAAO,CAAC,GAAG;IAWX;;;OAGG;IACH,OAAO,CAAC,OAAO;IAqBA,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IA6BvC;;;OAGG;YACW,gBAAgB;IAkIrB,SAAS,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI;IAiBzE;;;;;;;;;;;;;;;;;;;;;OAqBG;IACH,uBAAuB,CACrB,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,EAC/C,gBAAgB,EAAE,MAAM,GACvB,IAAI;IAsBP,OAAO,CAAC,iBAAiB;IAsCzB;;;OAGG;IAEG,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IAoCpC,cAAc,IAAI,IAAI;IAwBtB,mBAAmB,IAAI,MAAM;IAI7B,MAAM,IAAI,MAAM;IAIhB,QAAQ,IAAI,WAAW;IAIvB,WAAW,IAAI,OAAO;IAItB,YAAY,IAAI,OAAO;IAIvB,UAAU,IAAI,OAAO;IAIrB,uBAAuB,IAAI,OAAO;IAIlC,mBAAmB,IAAI,OAAO;IAI9B,aAAa,IAAI,OAAO;IAIxB,eAAe,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI;IAmB7C,QAAQ,IAAI,uBAAuB;IAgBnC,SAAS,CAAC,WAAW,IAAI,IAAI;IAwF7B,SAAS,CAAC,cAAc,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,GAAG,IAAI;IA8C5D;;OAEG;IACH,OAAO,CAAC,uBAAuB;IAwB/B,SAAS,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,GAAG,IAAI;cAYlB,SAAS,CAAC,GAAG,EAAE;QAAE,C
AAC,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,GAAG,CAAA;KAAE,GAAG,IAAI;IAQ/E;;;OAGG;IACH,OAAO,CAAC,oBAAoB;IAiC5B;;;OAGG;IACH,OAAO,CAAC,YAAY;IAwBpB;;;;;;;;;OASG;IACH,eAAe,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI;IAiBtE;;;OAGG;IACH,OAAO,CAAC,uBAAuB;IAiC/B;;;;OAIG;IACH,OAAO,CAAC,kBAAkB;CA2B3B"}
|
|
@@ -184,6 +184,23 @@ export interface IRecognitionClient {
|
|
|
184
184
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
185
185
|
*/
|
|
186
186
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
187
|
+
/**
|
|
188
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
189
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
190
|
+
* server validator) before transmitting.
|
|
191
|
+
*
|
|
192
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
193
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
194
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
195
|
+
* skip the resample step.
|
|
196
|
+
*
|
|
197
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
198
|
+
* mixed to mono by the caller.
|
|
199
|
+
*
|
|
200
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
201
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
202
|
+
*/
|
|
203
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
187
204
|
/**
|
|
188
205
|
* Stop recording and wait for final transcript
|
|
189
206
|
* The server will close the connection after sending the final transcript.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"recognition-client.types.d.ts","sourceRoot":"","sources":["../src/recognition-client.types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EACL,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,aAAa,EACb,mBAAmB,EACnB,oBAAoB,EACpB,gBAAgB,EAChB,aAAa,EACb,KAAK,EACN,MAAM,qBAAqB,CAAC;AAE7B;;;GAGG;AACH,oBAAY,WAAW;IACrB,+CAA+C;IAC/C,OAAO,YAAY;IAEnB,iDAAiD;IACjD,UAAU,eAAe;IAEzB,8DAA8D;IAC9D,SAAS,cAAc;IAEvB,mCAAmC;IACnC,KAAK,UAAU;IAEf,qDAAqD;IACrD,QAAQ,aAAa;IAErB,4CAA4C;IAC5C,OAAO,YAAY;IAEnB,6CAA6C;IAC7C,MAAM,WAAW;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC,gCAAgC;IAChC,GAAG,EAAE,MAAM,CAAC;IAEZ,yFAAyF;IACzF,YAAY,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC;CACvC;AAGD,MAAM,MAAM,uBAAuB,GAAG,sBAAsB,CAAC;AAE7D,MAAM,WAAW,wBAAwB;IACvC;;;;;;;;;OASG;IACH,GAAG,CAAC,EAAE,MAAM,CAAC;IAEb;;;;;;;;;;;;;;;;OAgBG;IACH,KAAK,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IAEvB,qEAAqE;IACrE,gBAAgB,CAAC,EAAE,gBAAgB,CAAC;IAEpC,qDAAqD;IACrD,WAAW,CAAC,EAAE,aAAa,CAAC;IAE5B;;;;OAIG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,mFAAmF;IACnF,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B;;OAEG;IACH,YAAY,CAAC,EAAE,sBAAsB,EAAE,CAAC;IAExC,qCAAqC;IACrC,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,6FAA6F;IAC7F,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,uCAAuC;IACvC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,wCAAwC;IACxC,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B,6FAA6F;IAC7F,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,2EAA2E;IAC3E,gBAAgB,CAAC,EAAE,WAAW,GAAG,SAAS,CAAC;IAE3C,2GAA2G;IAC3G,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAEhC,2CAA2C;IAC3C,YAAY,CAAC,EAAE,CAAC,MAAM,EAAE,qBAAqB,KAAK,IAAI,CAAC;IAEvD;;;OAGG;IACH,cAAc,CAAC,EAAE,CAAC,MAAM,EAAE,oBAAoB,KAAK,IAAI,CAAC;IAExD,oFAAoF;IACpF,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAElD;;;;OAIG;IACH,cAAc,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,KAAK,IAAI,CAAC;IAEzD,oFAAoF;IACpF,mBAAmB,CAAC,EAAE,CAAC,MAAM,EAAE,mBAAmB,KAAK,IAAI,CAAC;IAE5D,iCAAiC;IACjC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,aAAa,KAAK,IAAI,CAAC;IAEzC,2CAA2C;IAC3C,WAAW,CAAC,EAAE,MAAM,IAAI,CAAC;IAEzB;;;;OAIG;IACH,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM
,KAAK,IAAI,CAAC;IAExD,uDAAuD;IACvD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,sDAAsD;IACtD,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,wDAAwD;IACxD,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAE9B,uEAAuE;IACvE,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;;;;;;;;;;;;;;;;OAiBG;IACH,eAAe,CAAC,EAAE;QAChB,yEAAyE;QACzE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,oEAAoE;QACpE,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;IAEF;;;;;;OAMG;IAEH,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,GAAG,KAAK,IAAI,CAAC;CAC5F;AAED;;;;;GAKG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;OAIG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;;OAIG;IACH,SAAS,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI,CAAC;IAEjE;;;;OAIG;IACH,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAE/B;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,IAAI,IAAI,CAAC;IAEvB;;;;OAIG;IACH,mBAAmB,IAAI,MAAM,CAAC;IAE9B;;;OAGG;IACH,QAAQ,IAAI,WAAW,CAAC;IAExB;;;OAGG;IACH,WAAW,IAAI,OAAO,CAAC;IAEvB;;;OAGG;IACH,YAAY,IAAI,OAAO,CAAC;IAExB;;;OAGG;IACH,UAAU,IAAI,OAAO,CAAC;IAEtB;;;OAGG;IACH,uBAAuB,IAAI,OAAO,CAAC;IAEnC;;;OAGG;IACH,mBAAmB,IAAI,OAAO,CAAC;IAE/B;;;OAGG;IACH,QAAQ,IAAI,uBAAuB,CAAC;IAEpC;;;;OAIG;IACH,MAAM,IAAI,MAAM,CAAC;IAEjB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,CAAC;IAE9C;;;;OAIG;IACH,aAAa,IAAI,OAAO,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACtC,uCAAuC;IACvC,cAAc,EAAE,MAAM,CAAC;IAEvB,wCAAwC;IACxC,eAAe,EAAE,MAAM,CAAC;IAExB,4CAA4C;IAC5C,mBAAmB,EAAE,MAAM,CAAC;IAE5B,iDAAiD;IACjD,mBAAmB,EAAE,MAAM,CAAC;IAE5B,yCAAyC;IACzC,qBAAqB,EAAE,MAAM,CAAC;IAE9B,iEAAiE;IACjE,UAAU,EAAE,OAAO,CAAC;CACrB;AAED;;;;GAIG;AAEH,MAAM,WAAW,8CAA+C,SAAQ,wBAAwB;CAG/F"}
|
|
1
|
+
{"version":3,"file":"recognition-client.types.d.ts","sourceRoot":"","sources":["../src/recognition-client.types.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EACL,qBAAqB,EACrB,oBAAoB,EACpB,gBAAgB,EAChB,aAAa,EACb,mBAAmB,EACnB,oBAAoB,EACpB,gBAAgB,EAChB,aAAa,EACb,KAAK,EACN,MAAM,qBAAqB,CAAC;AAE7B;;;GAGG;AACH,oBAAY,WAAW;IACrB,+CAA+C;IAC/C,OAAO,YAAY;IAEnB,iDAAiD;IACjD,UAAU,eAAe;IAEzB,8DAA8D;IAC9D,SAAS,cAAc;IAEvB,mCAAmC;IACnC,KAAK,UAAU;IAEf,qDAAqD;IACrD,QAAQ,aAAa;IAErB,4CAA4C;IAC5C,OAAO,YAAY;IAEnB,6CAA6C;IAC7C,MAAM,WAAW;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,sBAAsB;IACrC,gCAAgC;IAChC,GAAG,EAAE,MAAM,CAAC;IAEZ,yFAAyF;IACzF,YAAY,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,MAAM,CAAC,CAAC;CACvC;AAGD,MAAM,MAAM,uBAAuB,GAAG,sBAAsB,CAAC;AAE7D,MAAM,WAAW,wBAAwB;IACvC;;;;;;;;;OASG;IACH,GAAG,CAAC,EAAE,MAAM,CAAC;IAEb;;;;;;;;;;;;;;;;OAgBG;IACH,KAAK,CAAC,EAAE,KAAK,GAAG,MAAM,CAAC;IAEvB,qEAAqE;IACrE,gBAAgB,CAAC,EAAE,gBAAgB,CAAC;IAEpC,qDAAqD;IACrD,WAAW,CAAC,EAAE,aAAa,CAAC;IAE5B;;;;OAIG;IACH,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,mFAAmF;IACnF,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B;;OAEG;IACH,YAAY,CAAC,EAAE,sBAAsB,EAAE,CAAC;IAExC,qCAAqC;IACrC,MAAM,CAAC,EAAE,MAAM,CAAC;IAEhB,6FAA6F;IAC7F,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,uCAAuC;IACvC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,wCAAwC;IACxC,SAAS,CAAC,EAAE,MAAM,CAAC;IAEnB,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAE1B,6FAA6F;IAC7F,QAAQ,CAAC,EAAE,MAAM,CAAC;IAElB,2EAA2E;IAC3E,gBAAgB,CAAC,EAAE,WAAW,GAAG,SAAS,CAAC;IAE3C,2GAA2G;IAC3G,sBAAsB,CAAC,EAAE,MAAM,CAAC;IAEhC,2CAA2C;IAC3C,YAAY,CAAC,EAAE,CAAC,MAAM,EAAE,qBAAqB,KAAK,IAAI,CAAC;IAEvD;;;OAGG;IACH,cAAc,CAAC,EAAE,CAAC,MAAM,EAAE,oBAAoB,KAAK,IAAI,CAAC;IAExD,oFAAoF;IACpF,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAElD;;;;OAIG;IACH,cAAc,CAAC,EAAE,CAAC,OAAO,EAAE,oBAAoB,KAAK,IAAI,CAAC;IAEzD,oFAAoF;IACpF,mBAAmB,CAAC,EAAE,CAAC,MAAM,EAAE,mBAAmB,KAAK,IAAI,CAAC;IAE5D,iCAAiC;IACjC,OAAO,CAAC,EAAE,CAAC,KAAK,EAAE,aAAa,KAAK,IAAI,CAAC;IAEzC,2CAA2C;IAC3C,WAAW,CAAC,EAAE,MAAM,IAAI,CAAC;IAEzB;;;;OAIG;IACH,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM
,KAAK,IAAI,CAAC;IAExD,uDAAuD;IACvD,aAAa,CAAC,EAAE,MAAM,CAAC;IAEvB,sDAAsD;IACtD,YAAY,CAAC,EAAE,MAAM,CAAC;IAEtB,wDAAwD;IACxD,oBAAoB,CAAC,EAAE,MAAM,CAAC;IAE9B,uEAAuE;IACvE,eAAe,CAAC,EAAE,MAAM,CAAC;IAEzB;;;;;;;;;;;;;;;;;OAiBG;IACH,eAAe,CAAC,EAAE;QAChB,yEAAyE;QACzE,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,oEAAoE;QACpE,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,CAAC;IAEF;;;;;;OAMG;IAEH,MAAM,CAAC,EAAE,CAAC,KAAK,EAAE,OAAO,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,EAAE,OAAO,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,GAAG,KAAK,IAAI,CAAC;CAC5F;AAED;;;;;GAKG;AACH,MAAM,WAAW,kBAAkB;IACjC;;;;OAIG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;;OAIG;IACH,SAAS,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI,CAAC;IAEjE;;;;;;;;;;;;;;;OAeG;IACH,uBAAuB,CACrB,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,EAC/C,gBAAgB,EAAE,MAAM,GACvB,IAAI,CAAC;IAER;;;;OAIG;IACH,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAE/B;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,IAAI,IAAI,CAAC;IAEvB;;;;OAIG;IACH,mBAAmB,IAAI,MAAM,CAAC;IAE9B;;;OAGG;IACH,QAAQ,IAAI,WAAW,CAAC;IAExB;;;OAGG;IACH,WAAW,IAAI,OAAO,CAAC;IAEvB;;;OAGG;IACH,YAAY,IAAI,OAAO,CAAC;IAExB;;;OAGG;IACH,UAAU,IAAI,OAAO,CAAC;IAEtB;;;OAGG;IACH,uBAAuB,IAAI,OAAO,CAAC;IAEnC;;;OAGG;IACH,mBAAmB,IAAI,OAAO,CAAC;IAE/B;;;OAGG;IACH,QAAQ,IAAI,uBAAuB,CAAC;IAEpC;;;;OAIG;IACH,MAAM,IAAI,MAAM,CAAC;IAEjB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,CAAC;IAE9C;;;;OAIG;IACH,aAAa,IAAI,OAAO,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,uBAAuB;IACtC,uCAAuC;IACvC,cAAc,EAAE,MAAM,CAAC;IAEvB,wCAAwC;IACxC,eAAe,EAAE,MAAM,CAAC;IAExB,4CAA4C;IAC5C,mBAAmB,EAAE,MAAM,CAAC;IAE5B,iDAAiD;IACjD,mBAAmB,EAAE,MAAM,CAAC;IAE5B,yCAAyC;IACzC,qBAAqB,EAAE,MAAM,CAAC;IAE9B,iEAAiE;IACjE,UAAU,EAAE,OAAO,CAAC;CACrB;AAED;;;;GAIG;AAEH,MAAM,WAAW,8CAA+C,SAAQ,wBAAwB;CAG/F"}
|
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
*/
|
|
10
10
|
import { RecognitionState } from './vgf-recognition-state.js';
|
|
11
11
|
import { IRecognitionClientConfig, ClientState } from './recognition-client.types.js';
|
|
12
|
-
import {
|
|
12
|
+
import type { GameContextV1 } from '@recog/shared-types';
|
|
13
13
|
/**
|
|
14
14
|
* Configuration for SimplifiedVGFRecognitionClient
|
|
15
15
|
*/
|
|
@@ -42,6 +42,14 @@ export interface ISimplifiedVGFRecognitionClient {
|
|
|
42
42
|
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
43
43
|
*/
|
|
44
44
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
45
|
+
/**
|
|
46
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
47
|
+
* downsamples to the session's target rate before transmitting. Use
|
|
48
|
+
* when capture is at the system's native rate (browser AudioContext is
|
|
49
|
+
* typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
|
|
50
|
+
* little-endian PCM, mono.
|
|
51
|
+
*/
|
|
52
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
45
53
|
/**
|
|
46
54
|
* Stop recording and wait for final transcription
|
|
47
55
|
* @returns Promise that resolves when transcription is complete
|
|
@@ -132,6 +140,13 @@ export declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRec
|
|
|
132
140
|
constructor(config: SimplifiedVGFClientConfig);
|
|
133
141
|
connect(): Promise<void>;
|
|
134
142
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
143
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
144
|
+
/**
|
|
145
|
+
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
146
|
+
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
147
|
+
* `isRecordingAudio`.
|
|
148
|
+
*/
|
|
149
|
+
private markRecordingStarted;
|
|
135
150
|
stopRecording(): Promise<void>;
|
|
136
151
|
stopAbnormally(): void;
|
|
137
152
|
getAudioUtteranceId(): string;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"simplified-vgf-recognition-client.d.ts","sourceRoot":"","sources":["../src/simplified-vgf-recognition-client.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACH,gBAAgB,EAInB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAEH,wBAAwB,EACxB,WAAW,EACd,MAAM,+BAA+B,CAAC;AASvC,OAAO,
|
|
1
|
+
{"version":3,"file":"simplified-vgf-recognition-client.d.ts","sourceRoot":"","sources":["../src/simplified-vgf-recognition-client.ts"],"names":[],"mappings":"AAAA;;;;;;;;GAQG;AAEH,OAAO,EACH,gBAAgB,EAInB,MAAM,4BAA4B,CAAC;AACpC,OAAO,EAEH,wBAAwB,EACxB,WAAW,EACd,MAAM,+BAA+B,CAAC;AASvC,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,qBAAqB,CAAC;AAEzD;;GAEG;AACH,MAAM,WAAW,yBAA0B,SAAQ,wBAAwB;IACvE;;;OAGG;IACH,aAAa,CAAC,EAAE,CAAC,KAAK,EAAE,gBAAgB,KAAK,IAAI,CAAC;IAElD;;;OAGG;IACH,YAAY,CAAC,EAAE,gBAAgB,CAAC;CACnC;AAED;;;;;GAKG;AACH,MAAM,WAAW,+BAA+B;IAE5C;;;OAGG;IACH,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAEzB;;;OAGG;IACH,SAAS,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI,CAAC;IAEjE;;;;;;OAMG;IACH,uBAAuB,CACnB,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,EAC/C,gBAAgB,EAAE,MAAM,GACzB,IAAI,CAAC;IAER;;;OAGG;IACH,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC;IAE/B;;;;;;;;;;;;;;;;OAgBG;IACH,cAAc,IAAI,IAAI,CAAC;IAGvB;;;OAGG;IACH,WAAW,IAAI,gBAAgB,CAAC;IAGhC;;OAEG;IACH,WAAW,IAAI,OAAO,CAAC;IAEvB;;OAEG;IACH,YAAY,IAAI,OAAO,CAAC;IAExB;;OAEG;IACH,UAAU,IAAI,OAAO,CAAC;IAEtB;;OAEG;IACH,uBAAuB,IAAI,OAAO,CAAC;IAEnC;;OAEG;IACH,mBAAmB,IAAI,OAAO,CAAC;IAG/B;;;;;;;OAOG;IACH,eAAe,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI,CAAC;IAE9C;;;OAGG;IACH,aAAa,IAAI,OAAO,CAAC;IAGzB;;OAEG;IACH,mBAAmB,IAAI,MAAM,CAAC;IAE9B;;OAEG;IACH,MAAM,IAAI,MAAM,CAAC;IAEjB;;OAEG;IACH,QAAQ,IAAI,WAAW,CAAC;CAE3B;AAED;;;GAGG;AACH,qBAAa,8BAA+B,YAAW,+BAA+B;IAClF,OAAO,CAAC,MAAM,CAAqB;IACnC,OAAO,CAAC,KAAK,CAAmB;IAChC,OAAO,CAAC,gBAAgB,CAAkB;IAC1C,OAAO,CAAC,mBAAmB,CAAkD;IAC7E,OAAO,CAAC,YAAY,CAAS;IAC7B,OAAO,CAAC,MAAM,CAAqC;IACnD,OAAO,CAAC,oBAAoB,CAAuB;gBAEvC,MAAM,EAAE,yBAAyB;IAqKvC,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;IAK9B,SAAS,CAAC,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,GAAG,IAAI;IAKhE,uBAAuB,CACnB,SAAS,EAAE,WAAW,GAAG,eAAe,GAAG,IAAI,EAC/C,gBAAgB,EAAE,MAAM,GACzB,IAAI;IAKP;;;;OAIG;IACH,OAAO,CAAC,oBAAoB;IAWtB,aAAa,IAAI,OAAO,CAAC,IAAI,CAAC;IA4BpC,cAAc,IAAI,IAAI;IAiCtB,mBAAmB,IAAI,MAAM;IAI7B,MAAM,IAAI,MAAM;IAIhB,QAAQ,IAAI,WAAW;IAIvB,WAAW,IAAI,OAAO;IAItB,YAAY,IAAI,OAAO;IAIvB,UAAU,IAAI,OAAO;IAIrB,uBAAuB,I
AAI,OAAO;IAIlC,mBAAmB,IAAI,OAAO;IAI9B,eAAe,CAAC,OAAO,EAAE,aAAa,GAAG,IAAI;IAI7C,aAAa,IAAI,OAAO;IAMxB,WAAW,IAAI,gBAAgB;IAI/B,OAAO,CAAC,gBAAgB;IAMxB,OAAO,CAAC,iBAAiB;CA8B5B;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0CG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE,yBAAyB,GAAG,+BAA+B,CAE5G"}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Downsample PCM16 mono audio to a target sample rate.
|
|
3
|
+
*
|
|
4
|
+
* Used by `sendAudioWithSampleRate()` so integrators whose capture pipeline
|
|
5
|
+
* produces audio at the system's native rate (AudioContext defaults to
|
|
6
|
+
* 44.1 kHz or 48 kHz on most desktop/mobile hardware) can hand raw bytes
|
|
7
|
+
* to the SDK without having to bring in their own resampler. The
|
|
8
|
+
* recognition-service `SampleRateValidator` accepts only 16 kHz, so the SDK
|
|
9
|
+
* resamples on the client side before sending.
|
|
10
|
+
*
|
|
11
|
+
* Algorithm: box-filter averaging. For each output sample we average the
|
|
12
|
+
* source samples that fall into its time window. This is the cheapest
|
|
13
|
+
* correct approach for speech ASR — it has a built-in low-pass effect that
|
|
14
|
+
* suppresses aliasing far better than naive decimation or linear
|
|
15
|
+
* interpolation, while staying O(n) with no FFT and no dependencies.
|
|
16
|
+
* For integer ratios (e.g. 48000 → 16000, ratio = 3) it degenerates to a
|
|
17
|
+
* plain 3-sample average; for fractional ratios (e.g. 44100 → 16000) the
|
|
18
|
+
* window count varies by ±1 across output samples.
|
|
19
|
+
*
|
|
20
|
+
* Assumes the input is signed 16-bit little-endian PCM (the SDK's
|
|
21
|
+
* documented `AudioEncoding.LINEAR16` input format). Mono only. Stereo
|
|
22
|
+
* audio must be mixed to mono by the caller.
|
|
23
|
+
*
|
|
24
|
+
* @param input - Source PCM16 audio (ArrayBuffer or any ArrayBufferView).
|
|
25
|
+
* @param srcRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
26
|
+
* @param targetRate - Target sample rate in Hz. Must be ≤ srcRate.
|
|
27
|
+
* @returns A new ArrayBuffer of PCM16 samples at `targetRate`.
|
|
28
|
+
* @throws Error if `targetRate > srcRate` (upsampling is not supported —
|
|
29
|
+
* capture at ≥ targetRate instead).
|
|
30
|
+
*/
|
|
31
|
+
export declare function downsamplePcm16(input: ArrayBuffer | ArrayBufferView, srcRate: number, targetRate: number): ArrayBuffer;
|
|
32
|
+
//# sourceMappingURL=audio-resampler.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"audio-resampler.d.ts","sourceRoot":"","sources":["../../src/utils/audio-resampler.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AACH,wBAAgB,eAAe,CAC7B,KAAK,EAAE,WAAW,GAAG,eAAe,EACpC,OAAO,EAAE,MAAM,EACf,UAAU,EAAE,MAAM,GACjB,WAAW,CA4Cb"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@volley/recognition-client-sdk",
|
|
3
|
-
"version": "0.1.782",
|
|
3
|
+
"version": "0.1.799",
|
|
4
4
|
"description": "Recognition Service TypeScript/Node.js Client SDK",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -55,9 +55,9 @@
|
|
|
55
55
|
"ts-jest": "29.2.5",
|
|
56
56
|
"typescript": "5.1.6",
|
|
57
57
|
"@recog/shared-config": "1.0.0",
|
|
58
|
-
"@recog/shared-types": "1.0.0",
|
|
59
58
|
"@recog/shared-utils": "1.0.0",
|
|
60
|
-
"@recog/websocket": "1.0.0"
|
|
59
|
+
"@recog/websocket": "1.0.0",
|
|
60
|
+
"@recog/shared-types": "1.0.0"
|
|
61
61
|
},
|
|
62
62
|
"keywords": [
|
|
63
63
|
"recognition",
|
package/src/index.spec.ts
CHANGED
|
@@ -6,6 +6,8 @@ describe('SDK top-level exports', () => {
|
|
|
6
6
|
expect(RecognitionProvider.SELF_SERVE_VLLM).toBe('self-serve-vllm');
|
|
7
7
|
expect(CartesiaModel.INK_WHISPER).toBe('ink-whisper');
|
|
8
8
|
expect(CartesiaModel.INK_WHISPER_20250604).toBe('ink-whisper-2025-06-04');
|
|
9
|
+
expect(SelfServeVllmModel.QWEN3_ASR_0_6B).toBe('qwen3-asr-0.6b');
|
|
10
|
+
expect(SelfServeVllmModel.QWEN3_ASR_0_6B_WOF_LETTER).toBe('qwen3-asr-0.6b-wof-letter');
|
|
9
11
|
expect(SelfServeVllmModel.QWEN3_ASR_1_7B).toBe('qwen3-asr-1.7b');
|
|
10
12
|
});
|
|
11
13
|
});
|
|
@@ -65,6 +65,7 @@ import type {
|
|
|
65
65
|
import { buildWebSocketUrl } from './utils/url-builder.js';
|
|
66
66
|
import { AudioRingBuffer } from './utils/audio-ring-buffer.js';
|
|
67
67
|
import { MessageHandler } from './utils/message-handler.js';
|
|
68
|
+
import { downsamplePcm16 } from './utils/audio-resampler.js';
|
|
68
69
|
import { ConnectionError } from './errors.js';
|
|
69
70
|
|
|
70
71
|
// ============================================================================
|
|
@@ -215,11 +216,24 @@ export class RealTimeTwoWayWebSocketRecognitionClient
|
|
|
215
216
|
const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4)); // Default: 4 attempts (3 retries), clamp 1-5
|
|
216
217
|
const delayMs = retryConfig.delayMs ?? 200; // Fast retry for short audio sessions
|
|
217
218
|
|
|
219
|
+
// Normalize encoding to AudioEncoding enum at construction so the binary
|
|
220
|
+
// frame header (uint32) and ASRRequest serialization both get a number.
|
|
221
|
+
// String inputs ('linear16', 'LINEAR16') are accepted with a warn-level
|
|
222
|
+
// log; invalid strings throw via AudioEncoding.coerce().
|
|
223
|
+
const normalizedASRConfig = config.asrRequestConfig
|
|
224
|
+
? {
|
|
225
|
+
...config.asrRequestConfig,
|
|
226
|
+
encoding: AudioEncoding.coerce(config.asrRequestConfig.encoding, (warning) =>
|
|
227
|
+
config.logger?.('warn', warning)
|
|
228
|
+
),
|
|
229
|
+
}
|
|
230
|
+
: undefined;
|
|
231
|
+
|
|
218
232
|
// Process config with defaults
|
|
219
233
|
this.config = {
|
|
220
234
|
url,
|
|
221
235
|
audioUtteranceId,
|
|
222
|
-
...(
|
|
236
|
+
...(normalizedASRConfig && { asrRequestConfig: normalizedASRConfig }),
|
|
223
237
|
...(config.gameContext && { gameContext: config.gameContext }),
|
|
224
238
|
...(config.callbackUrls && { callbackUrls: config.callbackUrls }),
|
|
225
239
|
onTranscript: config.onTranscript || (() => {}),
|
|
@@ -488,6 +502,53 @@ export class RealTimeTwoWayWebSocketRecognitionClient
|
|
|
488
502
|
this.sendAudioInternal(audioData);
|
|
489
503
|
}
|
|
490
504
|
|
|
505
|
+
/**
|
|
506
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
507
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
508
|
+
* before sending.
|
|
509
|
+
*
|
|
510
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
511
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
512
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
513
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
514
|
+
* `sendAudio()` to skip the resample step.
|
|
515
|
+
*
|
|
516
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
517
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
518
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
519
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
520
|
+
*
|
|
521
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
522
|
+
* mixed to mono by the caller.
|
|
523
|
+
*
|
|
524
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
525
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
526
|
+
*/
|
|
527
|
+
sendAudioWithSampleRate(
|
|
528
|
+
audioData: ArrayBuffer | ArrayBufferView | Blob,
|
|
529
|
+
sourceSampleRate: number
|
|
530
|
+
): void {
|
|
531
|
+
const targetRate =
|
|
532
|
+
typeof this.config.asrRequestConfig?.sampleRate === 'number'
|
|
533
|
+
? this.config.asrRequestConfig.sampleRate
|
|
534
|
+
: SampleRate.RATE_16000;
|
|
535
|
+
|
|
536
|
+
if (audioData instanceof Blob) {
|
|
537
|
+
blobToArrayBuffer(audioData)
|
|
538
|
+
.then((arrayBuffer) => {
|
|
539
|
+
this.sendAudioInternal(
|
|
540
|
+
downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
|
|
541
|
+
);
|
|
542
|
+
})
|
|
543
|
+
.catch((error) => {
|
|
544
|
+
this.log('warn', 'Failed to convert Blob to ArrayBuffer', error);
|
|
545
|
+
});
|
|
546
|
+
return;
|
|
547
|
+
}
|
|
548
|
+
|
|
549
|
+
this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
|
|
550
|
+
}
|
|
551
|
+
|
|
491
552
|
private sendAudioInternal(audioData: ArrayBuffer | ArrayBufferView): void {
|
|
492
553
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
493
554
|
if (bytes === 0) return;
|
|
@@ -692,10 +753,7 @@ export class RealTimeTwoWayWebSocketRecognitionClient
|
|
|
692
753
|
typeof this.config.asrRequestConfig.sampleRate === 'number'
|
|
693
754
|
? this.config.asrRequestConfig.sampleRate
|
|
694
755
|
: SampleRate.RATE_16000,
|
|
695
|
-
encoding:
|
|
696
|
-
typeof this.config.asrRequestConfig.encoding === 'number'
|
|
697
|
-
? this.config.asrRequestConfig.encoding
|
|
698
|
-
: AudioEncoding.LINEAR16,
|
|
756
|
+
encoding: this.config.asrRequestConfig.encoding as AudioEncoding,
|
|
699
757
|
interimResults: this.config.asrRequestConfig.interimResults ?? false,
|
|
700
758
|
// Auto-enable useContext if gameContext is provided, or use explicit value if set
|
|
701
759
|
useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
|
|
@@ -892,7 +950,7 @@ export class RealTimeTwoWayWebSocketRecognitionClient
|
|
|
892
950
|
? audioData.byteLength
|
|
893
951
|
: audioData.byteLength;
|
|
894
952
|
|
|
895
|
-
const encodingId = (this.config.asrRequestConfig?.encoding
|
|
953
|
+
const encodingId = (this.config.asrRequestConfig?.encoding ??
|
|
896
954
|
AudioEncoding.LINEAR16) as AudioEncoding;
|
|
897
955
|
|
|
898
956
|
const sampleRate =
|
|
@@ -987,7 +1045,7 @@ export class RealTimeTwoWayWebSocketRecognitionClient
|
|
|
987
1045
|
|
|
988
1046
|
if (byteLength === 0) return;
|
|
989
1047
|
|
|
990
|
-
const baseEncodingId = (this.config.asrRequestConfig?.encoding
|
|
1048
|
+
const baseEncodingId = (this.config.asrRequestConfig?.encoding ??
|
|
991
1049
|
AudioEncoding.LINEAR16) as AudioEncoding;
|
|
992
1050
|
|
|
993
1051
|
// Add offset to mark as prefix audio
|
|
@@ -239,6 +239,27 @@ export interface IRecognitionClient {
|
|
|
239
239
|
*/
|
|
240
240
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
241
241
|
|
|
242
|
+
/**
|
|
243
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
244
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
245
|
+
* server validator) before transmitting.
|
|
246
|
+
*
|
|
247
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
248
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
249
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
250
|
+
* skip the resample step.
|
|
251
|
+
*
|
|
252
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
253
|
+
* mixed to mono by the caller.
|
|
254
|
+
*
|
|
255
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
256
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
257
|
+
*/
|
|
258
|
+
sendAudioWithSampleRate(
|
|
259
|
+
audioData: ArrayBuffer | ArrayBufferView | Blob,
|
|
260
|
+
sourceSampleRate: number
|
|
261
|
+
): void;
|
|
262
|
+
|
|
242
263
|
/**
|
|
243
264
|
* Stop recording and wait for final transcript
|
|
244
265
|
* The server will close the connection after sending the final transcript.
|
|
@@ -27,7 +27,7 @@ import {
|
|
|
27
27
|
updateStateOnStop,
|
|
28
28
|
resetRecognitionVGFState
|
|
29
29
|
} from './vgf-recognition-mapper.js';
|
|
30
|
-
import {
|
|
30
|
+
import type { GameContextV1 } from '@recog/shared-types';
|
|
31
31
|
|
|
32
32
|
/**
|
|
33
33
|
* Configuration for SimplifiedVGFRecognitionClient
|
|
@@ -66,6 +66,18 @@ export interface ISimplifiedVGFRecognitionClient {
|
|
|
66
66
|
*/
|
|
67
67
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
68
68
|
|
|
69
|
+
/**
|
|
70
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
71
|
+
* downsamples to the session's target rate before transmitting. Use
|
|
72
|
+
* when capture is at the system's native rate (browser AudioContext is
|
|
73
|
+
* typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
|
|
74
|
+
* little-endian PCM, mono.
|
|
75
|
+
*/
|
|
76
|
+
sendAudioWithSampleRate(
|
|
77
|
+
audioData: ArrayBuffer | ArrayBufferView | Blob,
|
|
78
|
+
sourceSampleRate: number
|
|
79
|
+
): void;
|
|
80
|
+
|
|
69
81
|
/**
|
|
70
82
|
* Stop recording and wait for final transcription
|
|
71
83
|
* @returns Promise that resolves when transcription is complete
|
|
@@ -254,7 +266,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
254
266
|
...clientConfig,
|
|
255
267
|
|
|
256
268
|
// These callbacks ONLY update the VGF state sink
|
|
257
|
-
onTranscript: (result) => {
|
|
269
|
+
onTranscript: (result): void => {
|
|
258
270
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
259
271
|
if (result.audioUtteranceId && result.audioUtteranceId !== this.expectedUuid) {
|
|
260
272
|
if (this.logger) {
|
|
@@ -275,7 +287,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
275
287
|
}
|
|
276
288
|
},
|
|
277
289
|
|
|
278
|
-
onMetadata: (metadata) => {
|
|
290
|
+
onMetadata: (metadata): void => {
|
|
279
291
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
280
292
|
if (metadata.audioUtteranceId && metadata.audioUtteranceId !== this.expectedUuid) {
|
|
281
293
|
if (this.logger) {
|
|
@@ -291,14 +303,14 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
291
303
|
}
|
|
292
304
|
},
|
|
293
305
|
|
|
294
|
-
onFunctionCall: (result) => {
|
|
306
|
+
onFunctionCall: (result): void => {
|
|
295
307
|
// Pass through function call - no VGF state changes needed for P2 feature
|
|
296
308
|
if (clientConfig.onFunctionCall) {
|
|
297
309
|
clientConfig.onFunctionCall(result);
|
|
298
310
|
}
|
|
299
311
|
},
|
|
300
312
|
|
|
301
|
-
onError: (error) => {
|
|
313
|
+
onError: (error): void => {
|
|
302
314
|
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
303
315
|
if (error.audioUtteranceId && error.audioUtteranceId !== this.expectedUuid) {
|
|
304
316
|
if (this.logger) {
|
|
@@ -318,14 +330,14 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
318
330
|
}
|
|
319
331
|
},
|
|
320
332
|
|
|
321
|
-
onConnected: () => {
|
|
333
|
+
onConnected: (): void => {
|
|
322
334
|
// Don't update READY here - client can accept audio before connection
|
|
323
335
|
if (clientConfig.onConnected) {
|
|
324
336
|
clientConfig.onConnected();
|
|
325
337
|
}
|
|
326
338
|
},
|
|
327
339
|
|
|
328
|
-
onDisconnected: (code, reason) => {
|
|
340
|
+
onDisconnected: (code, reason): void => {
|
|
329
341
|
this.isRecordingAudio = false; // Reset on disconnect
|
|
330
342
|
if (clientConfig.onDisconnected) {
|
|
331
343
|
clientConfig.onDisconnected(code, reason);
|
|
@@ -343,19 +355,34 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
343
355
|
}
|
|
344
356
|
|
|
345
357
|
/**
 * Forward one audio chunk to the wrapped recognition client.
 *
 * The VGF state is switched to RECORDING (first chunk only) before the
 * chunk is handed to the underlying client.
 *
 * @param audioData - Audio chunk to forward as-is.
 */
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void {
  // First chunk flips the VGF state; later chunks skip straight to the send.
  this.markRecordingStarted();
  this.client.sendAudio(audioData);
}
|
|
358
361
|
|
|
362
|
+
/**
 * Forward one audio chunk together with its capture sample rate to the
 * wrapped recognition client, which is responsible for any resampling.
 *
 * The VGF state is switched to RECORDING (first chunk only) before the
 * chunk is handed to the underlying client.
 *
 * @param audioData - Audio chunk captured at `sourceSampleRate`.
 * @param sourceSampleRate - Sample rate of `audioData`, passed through unchanged.
 */
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void {
  // Same recording-state bookkeeping as sendAudio(), then delegate.
  this.markRecordingStarted();
  this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
}
|
|
369
|
+
|
|
370
|
+
/**
 * Transition the VGF state to RECORDING when the first audio chunk arrives.
 *
 * Does nothing while `isRecordingAudio` is already set; the flag is cleared
 * again by the disconnect handler and by `stopRecording()`, after which the
 * next chunk triggers a fresh transition (and a fresh timestamp).
 */
private markRecordingStarted(): void {
  if (!this.isRecordingAudio) {
    this.isRecordingAudio = true;

    // Copy the current state and stamp the moment recording began.
    const recordingState = {
      ...this.state,
      startRecordingStatus: 'RECORDING',
      startRecordingTimestamp: new Date().toISOString()
    };
    this.state = recordingState;

    this.notifyStateChange();
  }
}
|
|
385
|
+
|
|
359
386
|
async stopRecording(): Promise<void> {
|
|
360
387
|
this.isRecordingAudio = false;
|
|
361
388
|
this.state = updateStateOnStop(this.state);
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
import { downsamplePcm16 } from './audio-resampler.js';
|
|
2
|
+
|
|
3
|
+
// Helpers: convert between plain number lists and raw PCM16 buffers.
function buf(values: number[]): ArrayBuffer {
  return new Int16Array(values).slice().buffer;
}

function samples(raw: ArrayBuffer): number[] {
  return Array.from(new Int16Array(raw));
}

// Expected value of a 3-sample box-filter window.
const mean3 = (a: number, b: number, c: number): number => Math.round((a + b + c) / 3);

describe('downsamplePcm16', () => {
  it('returns a defensive copy when srcRate === targetRate', () => {
    const original = new Int16Array([100, 200, 300, 400]);
    const result = downsamplePcm16(original, 16000, 16000);
    expect(samples(result)).toEqual([100, 200, 300, 400]);

    // Writing to the source afterwards must leave the result untouched.
    original[0] = 9999;
    expect(samples(result)[0]).toBe(100);
  });

  it('integer ratio 48000 → 16000 averages every 3 source samples', () => {
    // Nine input samples collapse into three output samples (ratio = 3).
    const result = downsamplePcm16(buf([0, 6, 12, 30, 60, 90, -3, -6, -9]), 48000, 16000);
    expect(samples(result)).toEqual([
      mean3(0, 6, 12), // 6
      mean3(30, 60, 90), // 60
      mean3(-3, -6, -9), // -6
    ]);
  });

  it('fractional ratio 44100 → 16000 produces ~44100/16000 output samples', () => {
    // 441 samples at 44.1 kHz is about 10 ms, i.e. roughly 160 samples at 16 kHz.
    const flat = new Int16Array(441).fill(1000);
    const resampled = new Int16Array(downsamplePcm16(flat, 44100, 16000));
    expect(resampled.length).toBe(Math.floor(441 / (44100 / 16000))); // = 160

    // A constant signal must stay constant after averaging.
    for (const value of resampled) {
      expect(value).toBe(1000);
    }
  });

  it('accepts ArrayBuffer input as well as typed-array view', () => {
    const view = new Int16Array([10, 20, 30, 40, 50, 60]);
    const viaView = samples(downsamplePcm16(view, 48000, 16000));
    const viaBuffer = samples(downsamplePcm16(view.buffer, 48000, 16000));
    expect(viaBuffer).toEqual(viaView);
  });

  it('handles a typed-array view that shares a larger backing buffer', () => {
    // Mimic a window into a larger capture buffer: only the six samples the
    // view exposes may contribute, never the padding around them.
    const backing = new Int16Array([99, 99, 0, 6, 12, 30, 60, 90, 99, 99]);
    const window = new Int16Array(backing.buffer, 2 * Int16Array.BYTES_PER_ELEMENT, 6);
    const result = downsamplePcm16(window, 48000, 16000);
    expect(samples(result)).toEqual([mean3(0, 6, 12), mean3(30, 60, 90)]);
  });

  it('returns an empty buffer for empty input', () => {
    const result = downsamplePcm16(new Int16Array(0), 48000, 16000);
    expect(new Int16Array(result).length).toBe(0);
  });

  it('throws when asked to upsample', () => {
    const upsample = (): ArrayBuffer => downsamplePcm16(new Int16Array([1, 2]), 8000, 16000);
    expect(upsample).toThrow(/cannot upsample/i);
  });
});
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
 * Downsample PCM16 mono audio to a target sample rate.
 *
 * Used by `sendAudioWithSampleRate()` so integrators whose capture pipeline
 * produces audio at the system's native rate (AudioContext commonly runs at
 * 44.1 kHz or 48 kHz) can hand raw bytes to the SDK without bringing their
 * own resampler.
 *
 * Algorithm: box-filter averaging. Output sample `i` is the rounded mean of
 * the source samples in `[floor(i * srcRate / targetRate),
 * ceil((i + 1) * srcRate / targetRate))`. For integer ratios (48000 → 16000)
 * this is a plain N-sample average; for fractional ratios (44100 → 16000)
 * the window size varies by one and adjacent windows may share a boundary
 * sample. Averaging acts as a simple low-pass filter, stays O(n) and needs
 * no dependencies. Trailing source samples that do not fill a complete
 * output window are dropped.
 *
 * Samples are read through `Int16Array`, i.e. in platform byte order
 * (little-endian on all mainstream targets). Mono only: stereo audio must be
 * mixed to mono by the caller.
 *
 * @param input - Source PCM16 audio (ArrayBuffer or any ArrayBufferView).
 *   Its byte length must be a multiple of 2.
 * @param srcRate - Source sample rate in Hz (e.g. 44100, 48000). Must be a
 *   finite positive number.
 * @param targetRate - Target sample rate in Hz. Must be a finite positive
 *   number and ≤ `srcRate`.
 * @returns A new ArrayBuffer of PCM16 samples at `targetRate`. It never
 *   aliases the caller's memory, even when no resampling is needed.
 * @throws RangeError if either rate is not a finite positive number, or if
 *   the input byte length is odd.
 * @throws Error if `targetRate > srcRate` (upsampling is not supported —
 *   capture at ≥ targetRate instead).
 */
export function downsamplePcm16(
  input: ArrayBuffer | ArrayBufferView,
  srcRate: number,
  targetRate: number
): ArrayBuffer {
  // Reject NaN / Infinity / non-positive rates up front. Without this they
  // either produce a silently empty buffer or a cryptic typed-array error.
  if (
    !Number.isFinite(srcRate) ||
    !Number.isFinite(targetRate) ||
    srcRate <= 0 ||
    targetRate <= 0
  ) {
    throw new RangeError(
      `downsamplePcm16: sample rates must be finite positive numbers ` +
        `(got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }

  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; ` +
        `capture audio at ≥ ${targetRate}Hz instead.`
    );
  }

  // Copy out exactly the bytes the caller handed us. For a typed-array view
  // over a larger buffer this excludes neighbouring bytes and also removes
  // any alignment constraint on the view's byteOffset.
  const buffer = ArrayBuffer.isView(input)
    ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength)
    : input;

  // A half sample means the caller's framing is off; say so explicitly
  // instead of letting the Int16Array constructor fail obscurely.
  if (buffer.byteLength % 2 !== 0) {
    throw new RangeError(
      `downsamplePcm16: input byte length (${buffer.byteLength}) must be a ` +
        `multiple of 2 for 16-bit PCM.`
    );
  }

  const src = new Int16Array(buffer);

  if (srcRate === targetRate || src.length === 0) {
    // Return a defensive copy so the caller's data can't be mutated through
    // the returned buffer.
    return src.slice().buffer;
  }

  // Multiply before dividing: with integer rates the products are exact, so
  // floor/ceil cannot be pushed off by one by a pre-rounded ratio.
  const dstLen = Math.floor((src.length * targetRate) / srcRate);
  const dst = new Int16Array(dstLen);

  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor((i * srcRate) / targetRate);
    const endIdx = Math.min(Math.ceil(((i + 1) * srcRate) / targetRate), src.length);

    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      // In bounds by construction; `?? 0` only satisfies
      // noUncheckedIndexedAccess.
      sum += src[j] ?? 0;
    }

    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }

  return dst.buffer;
}
|