npm - @volley/recognition-client-sdk - Versions diffs - 0.1.782 → 0.1.799 - Mend

@volley/recognition-client-sdk 0.1.782 → 0.1.799

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)

package/dist/browser.bundled.d.ts +60 -4
package/dist/index.bundled.d.ts +75 -4
package/dist/index.js +115 -13
package/dist/index.js.map +4 -4
package/dist/recog-client-sdk.browser.js +95 -4
package/dist/recog-client-sdk.browser.js.map +4 -4
package/dist/recognition-client.d.ts +23 -0
package/dist/recognition-client.d.ts.map +1 -1
package/dist/recognition-client.types.d.ts +17 -0
package/dist/recognition-client.types.d.ts.map +1 -1
package/dist/simplified-vgf-recognition-client.d.ts +16 -1
package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
package/dist/utils/audio-resampler.d.ts +32 -0
package/dist/utils/audio-resampler.d.ts.map +1 -0
package/package.json +3 -3
package/src/index.spec.ts +2 -0
package/src/recognition-client.ts +65 -7
package/src/recognition-client.types.ts +21 -0
package/src/simplified-vgf-recognition-client.ts +44 -17
package/src/utils/audio-resampler.spec.ts +69 -0
package/src/utils/audio-resampler.ts +79 -0

package/dist/browser.bundled.d.ts CHANGED Viewed

@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
 }
 /**
  * Self-serve vLLM batch transcription models
- * Backed by recognition-inference / RunPod `/transcribe`
+ * Backed by recognition-inference / RunPod `/ws/transcribe`
  */
 declare enum SelfServeVllmModel {
+    QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
+    QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
     QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
 }
 /**
@@ -651,6 +653,20 @@ declare namespace AudioEncoding {
      * @returns true if valid encoding name
      */
     function isNameValid(nameStr: string): boolean;
+    /**
+     * Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
+     *
+     * - enum / number → returned as-is (already AudioEncoding-shaped)
+     * - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
+     *   Invokes `onStringInput` with a warning message so callers can route it
+     *   to their preferred logger.
+     * - invalid string → throws (preferred over silent fallback so typos surface)
+     * - undefined → defaults to {@link AudioEncoding.LINEAR16}
+     *
+     * Always normalize at the SDK / server boundary so downstream code can rely
+     * on a numeric AudioEncoding (the wire-level binary frame header is uint32).
+     */
+    function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
 }
 /**
  * Common sample rates (in Hz)
@@ -906,10 +922,10 @@ interface ASRRequestConfig {
      * doesn't respond with is_final=true after stopRecording().
      *
      * - aggressive: 100ms - fast response, may cut off slow providers
-     * - balanced: 500ms - current default, good for most cases
-     * - conservative: 1000ms - wait longer for complex utterances
+     * - balanced: 500ms - good for most cases
+     * - conservative: 1000ms - current default, wait longer for complex utterances
      *
-     * @default 'balanced'
+     * @default 'conservative'
      * @see FinalTranscriptStability enum for detailed descriptions
      */
     finalTranscriptStability?: FinalTranscriptStability | string;
@@ -1404,6 +1420,23 @@ interface IRecognitionClient {
      * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
      */
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
+     * downsamples to the session's target rate (currently 16 kHz, set by the
+     * server validator) before transmitting.
+     *
+     * Use this when your capture pipeline produces audio at the system's
+     * native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
+     * If your audio is already at the target rate, prefer `sendAudio()` to
+     * skip the resample step.
+     *
+     * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+     * mixed to mono by the caller.
+     *
+     * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+     * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+     */
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
     /**
      * Stop recording and wait for final transcript
      * The server will close the connection after sending the final transcript.
@@ -1603,6 +1636,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
      */
     private connectWithRetry;
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
+     * to the session's target rate (currently 16 kHz per server validator)
+     * before sending.
+     *
+     * Use this when your capture pipeline produces audio at the system's
+     * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
+     * desktop/mobile hardware — and you don't want to bring your own
+     * resampler. If your audio is already at the target rate, prefer
+     * `sendAudio()` to skip the resample step.
+     *
+     * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
+     * dependencies, has a built-in low-pass effect so aliasing stays out of
+     * the speech band. Suitable for ASR; not a substitute for a high-quality
+     * resampler if you're doing music or full-fidelity processing.
+     *
+     * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+     * mixed to mono by the caller.
+     *
+     * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+     * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+     */
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
     private sendAudioInternal;
     /**
      * Only active when client is in READY state. otherwise it will return immediately.

package/dist/index.bundled.d.ts CHANGED Viewed

@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
 }
 /**
  * Self-serve vLLM batch transcription models
- * Backed by recognition-inference / RunPod `/transcribe`
+ * Backed by recognition-inference / RunPod `/ws/transcribe`
  */
 declare enum SelfServeVllmModel {
+    QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
+    QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
     QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
 }
 /**
@@ -660,6 +662,20 @@ declare namespace AudioEncoding {
      * @returns true if valid encoding name
      */
     function isNameValid(nameStr: string): boolean;
+    /**
+     * Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
+     *
+     * - enum / number → returned as-is (already AudioEncoding-shaped)
+     * - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
+     *   Invokes `onStringInput` with a warning message so callers can route it
+     *   to their preferred logger.
+     * - invalid string → throws (preferred over silent fallback so typos surface)
+     * - undefined → defaults to {@link AudioEncoding.LINEAR16}
+     *
+     * Always normalize at the SDK / server boundary so downstream code can rely
+     * on a numeric AudioEncoding (the wire-level binary frame header is uint32).
+     */
+    function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
 }
 /**
  * Common sample rates (in Hz)
@@ -1636,10 +1652,10 @@ interface ASRRequestConfig {
      * doesn't respond with is_final=true after stopRecording().
      *
      * - aggressive: 100ms - fast response, may cut off slow providers
-     * - balanced: 500ms - current default, good for most cases
-     * - conservative: 1000ms - wait longer for complex utterances
+     * - balanced: 500ms - good for most cases
+     * - conservative: 1000ms - current default, wait longer for complex utterances
      *
-     * @default 'balanced'
+     * @default 'conservative'
      * @see FinalTranscriptStability enum for detailed descriptions
      */
     finalTranscriptStability?: FinalTranscriptStability | string;
@@ -2173,6 +2189,23 @@ interface IRecognitionClient {
      * @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
      */
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
+     * downsamples to the session's target rate (currently 16 kHz, set by the
+     * server validator) before transmitting.
+     *
+     * Use this when your capture pipeline produces audio at the system's
+     * native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
+     * If your audio is already at the target rate, prefer `sendAudio()` to
+     * skip the resample step.
+     *
+     * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+     * mixed to mono by the caller.
+     *
+     * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+     * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+     */
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
     /**
      * Stop recording and wait for final transcript
      * The server will close the connection after sending the final transcript.
@@ -2378,6 +2411,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
      */
     private connectWithRetry;
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
+     * to the session's target rate (currently 16 kHz per server validator)
+     * before sending.
+     *
+     * Use this when your capture pipeline produces audio at the system's
+     * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
+     * desktop/mobile hardware — and you don't want to bring your own
+     * resampler. If your audio is already at the target rate, prefer
+     * `sendAudio()` to skip the resample step.
+     *
+     * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
+     * dependencies, has a built-in low-pass effect so aliasing stays out of
+     * the speech band. Suitable for ASR; not a substitute for a high-quality
+     * resampler if you're doing music or full-fidelity processing.
+     *
+     * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+     * mixed to mono by the caller.
+     *
+     * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+     * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+     */
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
     private sendAudioInternal;
     /**
      * Only active when client is in READY state. otherwise it will return immediately.
@@ -2801,6 +2857,14 @@ interface ISimplifiedVGFRecognitionClient {
      * @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
      */
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    /**
+     * Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
+     * downsamples to the session's target rate before transmitting. Use
+     * when capture is at the system's native rate (browser AudioContext is
+     * typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
+     * little-endian PCM, mono.
+     */
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
     /**
      * Stop recording and wait for final transcription
      * @returns Promise that resolves when transcription is complete
@@ -2891,6 +2955,13 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
     constructor(config: SimplifiedVGFClientConfig);
     connect(): Promise<void>;
     sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
+    sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
+    /**
+     * Set VGF recording status to RECORDING on the first audio chunk.
+     * Idempotent — subsequent calls are no-ops until disconnect/stop resets
+     * `isRecordingAudio`.
+     */
+    private markRecordingStarted;
     stopRecording(): Promise<void>;
     stopAbnormally(): void;
     getAudioUtteranceId(): string;

package/dist/index.js CHANGED Viewed

@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
 })(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
 var SelfServeVllmModel;
 (function(SelfServeVllmModel2) {
+  SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
+  SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
   SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
 })(SelfServeVllmModel || (SelfServeVllmModel = {}));
@@ -4642,6 +4644,21 @@ var AudioEncoding;
     return NAME_TO_ENUM.has(nameStr.toUpperCase());
   }
   AudioEncoding2.isNameValid = isNameValid;
+  function coerce2(value, onStringInput) {
+    if (value === void 0) {
+      return AudioEncoding2.LINEAR16;
+    }
+    if (typeof value === "number") {
+      return value;
+    }
+    const result = fromName(value);
+    if (result === void 0) {
+      throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
+    }
+    onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
+    return result;
+  }
+  AudioEncoding2.coerce = coerce2;
 })(AudioEncoding || (AudioEncoding = {}));
 var PREFIX_AUDIO_ENCODING_OFFSET = 128;
 var SampleRate;
@@ -5412,6 +5429,37 @@ var MessageHandler = class {
   }
 };
+// src/utils/audio-resampler.ts
+function downsamplePcm16(input, srcRate, targetRate) {
+  if (targetRate > srcRate) {
+    throw new Error(
+      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
+    );
+  }
+  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
+  const src = new Int16Array(buffer);
+  if (srcRate === targetRate || src.length === 0) {
+    return src.slice().buffer;
+  }
+  const ratio = srcRate / targetRate;
+  const dstLen = Math.floor(src.length / ratio);
+  const dst = new Int16Array(dstLen);
+  for (let i = 0; i < dstLen; i++) {
+    const startPos = i * ratio;
+    const endPos = (i + 1) * ratio;
+    const startIdx = Math.floor(startPos);
+    const endIdx = Math.min(Math.ceil(endPos), src.length);
+    let sum = 0;
+    let count = 0;
+    for (let j = startIdx; j < endIdx; j++) {
+      sum += src[j] ?? 0;
+      count++;
+    }
+    dst[i] = count > 0 ? Math.round(sum / count) : 0;
+  }
+  return dst.buffer;
+}
 // src/errors.ts
 var RecognitionError = class extends Error {
   constructor(errorType, message) {
@@ -5512,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
     const retryConfig = config.connectionRetry || {};
     const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
     const delayMs = retryConfig.delayMs ?? 200;
+    const normalizedASRConfig = config.asrRequestConfig ? {
+      ...config.asrRequestConfig,
+      encoding: AudioEncoding.coerce(
+        config.asrRequestConfig.encoding,
+        (warning) => config.logger?.("warn", warning)
+      )
+    } : void 0;
     this.config = {
       url,
       audioUtteranceId,
-      ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
+      ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
       ...config.gameContext && { gameContext: config.gameContext },
       ...config.callbackUrls && { callbackUrls: config.callbackUrls },
       onTranscript: config.onTranscript || (() => {
@@ -5713,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
     }
     this.sendAudioInternal(audioData);
   }
+  /**
+   * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
+   * to the session's target rate (currently 16 kHz per server validator)
+   * before sending.
+   *
+   * Use this when your capture pipeline produces audio at the system's
+   * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
+   * desktop/mobile hardware — and you don't want to bring your own
+   * resampler. If your audio is already at the target rate, prefer
+   * `sendAudio()` to skip the resample step.
+   *
+   * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
+   * dependencies, has a built-in low-pass effect so aliasing stays out of
+   * the speech band. Suitable for ASR; not a substitute for a high-quality
+   * resampler if you're doing music or full-fidelity processing.
+   *
+   * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+   * mixed to mono by the caller.
+   *
+   * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+   * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+   */
+  sendAudioWithSampleRate(audioData, sourceSampleRate) {
+    const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
+    if (audioData instanceof Blob) {
+      blobToArrayBuffer(audioData).then((arrayBuffer) => {
+        this.sendAudioInternal(
+          downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
+        );
+      }).catch((error) => {
+        this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
+      });
+      return;
+    }
+    this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
+  }
   sendAudioInternal(audioData) {
     const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
     if (bytes === 0) return;
@@ -5859,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
         model: this.config.asrRequestConfig.model,
         language: this.config.asrRequestConfig.language?.toString() || "en",
         sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
-        encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
+        encoding: this.config.asrRequestConfig.encoding,
         interimResults: this.config.asrRequestConfig.interimResults ?? false,
         // Auto-enable useContext if gameContext is provided, or use explicit value if set
         useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -6023,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
    */
   sendAudioNow(audioData) {
     const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
-    const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
+    const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
     const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
     super.sendAudio(
       audioData,
@@ -6091,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
   sendPrefixAudioNow(audioData) {
     const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
     if (byteLength === 0) return;
-    const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
+    const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
     const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
     const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
     this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
@@ -6619,17 +6710,28 @@ var SimplifiedVGFRecognitionClient = class {
     await this.client.connect();
   }
   sendAudio(audioData) {
-    if (!this.isRecordingAudio) {
-      this.isRecordingAudio = true;
-      this.state = {
-        ...this.state,
-        startRecordingStatus: "RECORDING",
-        startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
-      };
-      this.notifyStateChange();
-    }
+    this.markRecordingStarted();
     this.client.sendAudio(audioData);
   }
+  sendAudioWithSampleRate(audioData, sourceSampleRate) {
+    this.markRecordingStarted();
+    this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
+  }
+  /**
+   * Set VGF recording status to RECORDING on the first audio chunk.
+   * Idempotent — subsequent calls are no-ops until disconnect/stop resets
+   * `isRecordingAudio`.
+   */
+  markRecordingStarted() {
+    if (this.isRecordingAudio) return;
+    this.isRecordingAudio = true;
+    this.state = {
+      ...this.state,
+      startRecordingStatus: "RECORDING",
+      startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
+    };
+    this.notifyStateChange();
+  }
   async stopRecording() {
     this.isRecordingAudio = false;
     this.state = updateStateOnStop(this.state);