@volley/recognition-client-sdk 0.1.782 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +60 -4
- package/dist/index.bundled.d.ts +75 -4
- package/dist/index.js +115 -13
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +95 -4
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +3 -3
- package/src/index.spec.ts +2 -0
- package/src/recognition-client.ts +65 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
|
@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
|
|
|
3838
3838
|
})(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
|
|
3839
3839
|
// Model identifiers for the self-serve vLLM ASR provider.
// Keys are the enum member names; values are the model-id strings.
// The `|| {}` guard keeps any members already attached to an earlier
// declaration of the same binding, exactly like the compiled-enum IIFE form.
var SelfServeVllmModel = SelfServeVllmModel || {};
Object.assign(SelfServeVllmModel, {
  QWEN3_ASR_0_6B: "qwen3-asr-0.6b",
  QWEN3_ASR_0_6B_WOF_LETTER: "qwen3-asr-0.6b-wof-letter",
  QWEN3_ASR_1_7B: "qwen3-asr-1.7b",
});
|
|
3843
3845
|
|
|
@@ -4619,6 +4621,21 @@ var AudioEncoding;
|
|
|
4619
4621
|
return NAME_TO_ENUM.has(nameStr.toUpperCase());
|
|
4620
4622
|
}
|
|
4621
4623
|
AudioEncoding2.isNameValid = isNameValid;
|
|
4624
|
+
/**
 * Normalizes a caller-supplied audio encoding.
 *
 * Resolution order:
 *  - `undefined` or `null`  -> `AudioEncoding2.LINEAR16` (the default).
 *  - a number               -> returned unchanged (no range validation here).
 *  - anything else          -> looked up with `fromName()`; an unknown name throws.
 *
 * @param {number|string|null|undefined} value - Encoding as given by the caller.
 * @param {(warning: string) => void} [onStringInput] - Optional hook, called with
 *   a human-readable warning only when a string was successfully resolved.
 * @returns {number} The numeric value, or whatever `fromName()` resolved the
 *   name to (presumably an AudioEncoding id - confirm against `fromName`).
 * @throws {Error} When `value` is a name that `fromName()` does not recognise.
 */
function coerce2(value, onStringInput) {
  // `== null` deliberately matches both undefined and null: configs that went
  // through JSON carry null for "not set", and the audio send paths already
  // fall back with `??`, which treats null the same way.
  if (value == null) {
    return AudioEncoding2.LINEAR16;
  }
  if (typeof value === "number") {
    return value;
  }
  const result = fromName(value);
  if (result === void 0) {
    throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
  }
  // Nudge callers towards the enum without failing their request.
  onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
  return result;
}
|
|
4638
|
+
AudioEncoding2.coerce = coerce2;
|
|
4622
4639
|
})(AudioEncoding || (AudioEncoding = {}));
|
|
4623
4640
|
var PREFIX_AUDIO_ENCODING_OFFSET = 128;
|
|
4624
4641
|
var SampleRate;
|
|
@@ -5344,6 +5361,37 @@ var MessageHandler = class {
|
|
|
5344
5361
|
}
|
|
5345
5362
|
};
|
|
5346
5363
|
|
|
5364
|
+
// src/utils/audio-resampler.ts
|
|
5365
|
+
/**
 * Downsamples 16-bit PCM audio by averaging each window of source samples
 * that maps onto one output sample.
 *
 * The input is never modified and the returned buffer is always a fresh
 * allocation, including when no resampling is needed.
 *
 * @param {ArrayBuffer|ArrayBufferView} input - Raw PCM16 bytes. Views are
 *   honoured with their byteOffset/byteLength; samples are read with the
 *   platform's native Int16 byte order.
 * @param {number} srcRate - Sample rate of `input`, in Hz.
 * @param {number} targetRate - Desired output sample rate, in Hz.
 * @returns {ArrayBuffer} PCM16 bytes at `targetRate`.
 * @throws {RangeError} If either rate is not a positive finite number, or the
 *   input byte length is odd (not a whole number of 16-bit samples).
 * @throws {Error} If `targetRate` is greater than `srcRate` (upsampling).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Reject unusable rates up front. Without this, NaN/undefined/0 lead to a
  // NaN or Infinity ratio and a zero-length result, i.e. audio is dropped
  // silently instead of the caller learning about the bad argument.
  if (!Number.isFinite(srcRate) || !Number.isFinite(targetRate) || srcRate <= 0 || targetRate <= 0) {
    throw new RangeError(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }

  // Copy the viewed bytes into their own buffer: a view may start at an odd
  // byteOffset, which Int16Array cannot be constructed over directly.
  const isView = ArrayBuffer.isView(input);
  const buffer = isView ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;

  if (buffer.byteLength % 2 !== 0) {
    throw new RangeError(
      `downsamplePcm16: PCM16 input must have an even byte length (got ${buffer.byteLength} bytes).`
    );
  }

  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    // For a view, `buffer` is already a private copy; only copy again when the
    // caller handed us their own ArrayBuffer.
    return isView ? buffer : buffer.slice(0);
  }

  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);

  for (let i = 0; i < dstLen; i++) {
    // Source window covering output sample i; adjacent windows may share a
    // boundary sample when the ratio is not an integer.
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);

    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
|
|
5394
|
+
|
|
5347
5395
|
// src/errors.ts
|
|
5348
5396
|
var RecognitionError = class extends Error {
|
|
5349
5397
|
constructor(errorType, message) {
|
|
@@ -5418,10 +5466,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5418
5466
|
const retryConfig = config.connectionRetry || {};
|
|
5419
5467
|
const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
|
|
5420
5468
|
const delayMs = retryConfig.delayMs ?? 200;
|
|
5469
|
+
const normalizedASRConfig = config.asrRequestConfig ? {
|
|
5470
|
+
...config.asrRequestConfig,
|
|
5471
|
+
encoding: AudioEncoding.coerce(
|
|
5472
|
+
config.asrRequestConfig.encoding,
|
|
5473
|
+
(warning) => config.logger?.("warn", warning)
|
|
5474
|
+
)
|
|
5475
|
+
} : void 0;
|
|
5421
5476
|
this.config = {
|
|
5422
5477
|
url,
|
|
5423
5478
|
audioUtteranceId,
|
|
5424
|
-
...
|
|
5479
|
+
...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
|
|
5425
5480
|
...config.gameContext && { gameContext: config.gameContext },
|
|
5426
5481
|
...config.callbackUrls && { callbackUrls: config.callbackUrls },
|
|
5427
5482
|
onTranscript: config.onTranscript || (() => {
|
|
@@ -5619,6 +5674,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5619
5674
|
}
|
|
5620
5675
|
this.sendAudioInternal(audioData);
|
|
5621
5676
|
}
|
|
5677
|
+
/**
|
|
5678
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
5679
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
5680
|
+
* before sending.
|
|
5681
|
+
*
|
|
5682
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
5683
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
5684
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
5685
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
5686
|
+
* `sendAudio()` to skip the resample step.
|
|
5687
|
+
*
|
|
5688
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
5689
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
5690
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
5691
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
5692
|
+
*
|
|
5693
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
5694
|
+
* mixed to mono by the caller.
|
|
5695
|
+
*
|
|
5696
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
5697
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
5698
|
+
*/
|
|
5699
|
+
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
5700
|
+
const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5701
|
+
if (audioData instanceof Blob) {
|
|
5702
|
+
blobToArrayBuffer(audioData).then((arrayBuffer) => {
|
|
5703
|
+
this.sendAudioInternal(
|
|
5704
|
+
downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
|
|
5705
|
+
);
|
|
5706
|
+
}).catch((error) => {
|
|
5707
|
+
this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
|
|
5708
|
+
});
|
|
5709
|
+
return;
|
|
5710
|
+
}
|
|
5711
|
+
this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
|
|
5712
|
+
}
|
|
5622
5713
|
sendAudioInternal(audioData) {
|
|
5623
5714
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5624
5715
|
if (bytes === 0) return;
|
|
@@ -5765,7 +5856,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5765
5856
|
model: this.config.asrRequestConfig.model,
|
|
5766
5857
|
language: this.config.asrRequestConfig.language?.toString() || "en",
|
|
5767
5858
|
sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
|
|
5768
|
-
encoding:
|
|
5859
|
+
encoding: this.config.asrRequestConfig.encoding,
|
|
5769
5860
|
interimResults: this.config.asrRequestConfig.interimResults ?? false,
|
|
5770
5861
|
// Auto-enable useContext if gameContext is provided, or use explicit value if set
|
|
5771
5862
|
useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
|
|
@@ -5929,7 +6020,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5929
6020
|
*/
|
|
5930
6021
|
sendAudioNow(audioData) {
|
|
5931
6022
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5932
|
-
const encodingId = this.config.asrRequestConfig?.encoding
|
|
6023
|
+
const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
5933
6024
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5934
6025
|
super.sendAudio(
|
|
5935
6026
|
audioData,
|
|
@@ -5997,7 +6088,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5997
6088
|
sendPrefixAudioNow(audioData) {
|
|
5998
6089
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5999
6090
|
if (byteLength === 0) return;
|
|
6000
|
-
const baseEncodingId = this.config.asrRequestConfig?.encoding
|
|
6091
|
+
const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
6001
6092
|
const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
|
|
6002
6093
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
6003
6094
|
this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
|