@volley/recognition-client-sdk 0.1.782 → 0.1.800

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
3838
3838
  })(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
3839
3839
  // String-valued enum object: each assignment below maps a SelfServeVllmModel
  // member name to the model identifier string it stands for. The IIFE reuses
  // an existing SelfServeVllmModel object if one is already defined.
  var SelfServeVllmModel;
3840
3840
  (function(SelfServeVllmModel2) {
3841
+ SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
3842
+ SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
3841
3843
  SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
3842
3844
  })(SelfServeVllmModel || (SelfServeVllmModel = {}));
3843
3845
 
@@ -4619,6 +4621,21 @@ var AudioEncoding;
4619
4621
  return NAME_TO_ENUM.has(nameStr.toUpperCase());
4620
4622
  }
4621
4623
  AudioEncoding2.isNameValid = isNameValid;
4624
+ function coerce2(value, onStringInput) {
4625
+ if (value === void 0) {
4626
+ return AudioEncoding2.LINEAR16;
4627
+ }
4628
+ if (typeof value === "number") {
4629
+ return value;
4630
+ }
4631
+ const result = fromName(value);
4632
+ if (result === void 0) {
4633
+ throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
4634
+ }
4635
+ onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
4636
+ return result;
4637
+ }
4638
+ AudioEncoding2.coerce = coerce2;
4622
4639
  })(AudioEncoding || (AudioEncoding = {}));
4623
4640
  var PREFIX_AUDIO_ENCODING_OFFSET = 128;
4624
4641
  var SampleRate;
@@ -5344,6 +5361,37 @@ var MessageHandler = class {
5344
5361
  }
5345
5362
  };
5346
5363
 
5364
+ // src/utils/audio-resampler.ts
5365
/**
 * Downsample 16-bit PCM samples by averaging consecutive source samples
 * (box filter).
 *
 * Each output sample is the rounded mean of the source samples in the window
 * `[floor(i * ratio), ceil((i + 1) * ratio))`, where `ratio = srcRate / targetRate`.
 * For non-integer ratios adjacent windows can share a boundary sample. The
 * function keeps no state between calls, so each chunk is resampled on its own.
 *
 * @param input - An ArrayBuffer, or any ArrayBuffer view, holding 16-bit
 *   samples in platform byte order. A view is copied first, so the view's
 *   byte offset does not need to be 2-byte aligned.
 * @param srcRate - Sample rate of `input` in Hz.
 * @param targetRate - Desired output sample rate in Hz; must not exceed `srcRate`.
 * @returns A new buffer with the downsampled samples. When the rates are equal
 *   or the input is empty, a copy of the input samples is returned.
 * @throws {Error} When either rate is not a positive finite number, or when
 *   `targetRate` is greater than `srcRate` (upsampling is not supported).
 * @throws {RangeError} When the input byte length is not a multiple of 2.
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Reject NaN / undefined / zero / negative / infinite rates up front. Without
  // this they slip past the upsample guard and produce an empty buffer, which
  // silently discards the caller's audio.
  const isUsableRate = (rate) => rate > 0 && rate < Infinity;
  if (!isUsableRate(srcRate) || !isUsableRate(targetRate)) {
    throw new Error(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // Copy the bytes a view covers so the Int16Array below starts at offset 0.
  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
  if (buffer.byteLength % 2 !== 0) {
    // Same error type the Int16Array constructor would raise, but with a
    // message that names the actual problem.
    throw new RangeError(
      `downsamplePcm16: input byte length ${buffer.byteLength} is not a multiple of 2; PCM16 audio needs whole 16-bit samples.`
    );
  }
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    // Clamp so the last window never reads past the end of the source.
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    let count = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j] ?? 0;
      count++;
    }
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
5394
+
5347
5395
  // src/errors.ts
5348
5396
  var RecognitionError = class extends Error {
5349
5397
  constructor(errorType, message) {
@@ -5418,10 +5466,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5418
5466
  const retryConfig = config.connectionRetry || {};
5419
5467
  const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
5420
5468
  const delayMs = retryConfig.delayMs ?? 200;
5469
+ const normalizedASRConfig = config.asrRequestConfig ? {
5470
+ ...config.asrRequestConfig,
5471
+ encoding: AudioEncoding.coerce(
5472
+ config.asrRequestConfig.encoding,
5473
+ (warning) => config.logger?.("warn", warning)
5474
+ )
5475
+ } : void 0;
5421
5476
  this.config = {
5422
5477
  url,
5423
5478
  audioUtteranceId,
5424
- ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
5479
+ ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
5425
5480
  ...config.gameContext && { gameContext: config.gameContext },
5426
5481
  ...config.callbackUrls && { callbackUrls: config.callbackUrls },
5427
5482
  onTranscript: config.onTranscript || (() => {
@@ -5619,6 +5674,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5619
5674
  }
5620
5675
  this.sendAudioInternal(audioData);
5621
5676
  }
5677
+ /**
5678
+ * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
5679
+ * to the session's target rate (currently 16 kHz per server validator)
5680
+ * before sending.
5681
+ *
5682
+ * Use this when your capture pipeline produces audio at the system's
5683
+ * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
5684
+ * desktop/mobile hardware — and you don't want to bring your own
5685
+ * resampler. If your audio is already at the target rate, prefer
5686
+ * `sendAudio()` to skip the resample step.
5687
+ *
5688
+ * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
5689
+ * dependencies, has a built-in low-pass effect so aliasing stays out of
5690
+ * the speech band. Suitable for ASR; not a substitute for a high-quality
5691
+ * resampler if you're doing music or full-fidelity processing.
5692
+ *
5693
+ * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
5694
+ * mixed to mono by the caller.
5695
+ *
5696
+ * @param audioData - PCM16 mono audio at `sourceSampleRate`.
5697
+ * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
5698
+ */
5699
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
5700
+ const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5701
+ if (audioData instanceof Blob) {
5702
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
5703
+ this.sendAudioInternal(
5704
+ downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
5705
+ );
5706
+ }).catch((error) => {
5707
+ this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
5708
+ });
5709
+ return;
5710
+ }
5711
+ this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
5712
+ }
5622
5713
  sendAudioInternal(audioData) {
5623
5714
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5624
5715
  if (bytes === 0) return;
@@ -5765,7 +5856,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5765
5856
  model: this.config.asrRequestConfig.model,
5766
5857
  language: this.config.asrRequestConfig.language?.toString() || "en",
5767
5858
  sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
5768
- encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
5859
+ encoding: this.config.asrRequestConfig.encoding,
5769
5860
  interimResults: this.config.asrRequestConfig.interimResults ?? false,
5770
5861
  // Auto-enable useContext if gameContext is provided, or use explicit value if set
5771
5862
  useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -5929,7 +6020,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5929
6020
  */
5930
6021
  sendAudioNow(audioData) {
5931
6022
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5932
- const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6023
+ const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
5933
6024
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5934
6025
  super.sendAudio(
5935
6026
  audioData,
@@ -5997,7 +6088,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5997
6088
  sendPrefixAudioNow(audioData) {
5998
6089
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5999
6090
  if (byteLength === 0) return;
6000
- const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6091
+ const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
6001
6092
  const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
6002
6093
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
6003
6094
  this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });