npm - @volley/recognition-client-sdk - Versions diffs - 0.1.782 → 0.1.800 - Mend

@volley/recognition-client-sdk 0.1.782 → 0.1.800

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries and as they appear in that public registry. It is provided for informational purposes only.

Files changed (29)

package/dist/browser.bundled.d.ts +75 -4
package/dist/index.bundled.d.ts +198 -87
package/dist/index.js +191 -20
package/dist/index.js.map +4 -4
package/dist/recog-client-sdk.browser.js +95 -4
package/dist/recog-client-sdk.browser.js.map +4 -4
package/dist/recognition-client.d.ts +23 -0
package/dist/recognition-client.d.ts.map +1 -1
package/dist/recognition-client.types.d.ts +32 -0
package/dist/recognition-client.types.d.ts.map +1 -1
package/dist/simplified-vgf-recognition-client.d.ts +22 -85
package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
package/dist/utils/audio-resampler.d.ts +32 -0
package/dist/utils/audio-resampler.d.ts.map +1 -0
package/dist/vgf-recognition-mapper.d.ts +9 -17
package/dist/vgf-recognition-mapper.d.ts.map +1 -1
package/dist/vgf-recognition-state.d.ts +103 -0
package/dist/vgf-recognition-state.d.ts.map +1 -1
package/package.json +1 -1
package/src/index.spec.ts +2 -0
package/src/recognition-client.ts +65 -7
package/src/recognition-client.types.ts +37 -0
package/src/simplified-vgf-recognition-client.spec.ts +0 -27
package/src/simplified-vgf-recognition-client.ts +97 -127
package/src/utils/audio-resampler.spec.ts +69 -0
package/src/utils/audio-resampler.ts +79 -0
package/src/vgf-recognition-mapper.spec.ts +143 -0
package/src/vgf-recognition-mapper.ts +35 -45
package/src/vgf-recognition-state.ts +19 -1

package/dist/index.js CHANGED

@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
 })(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
 var SelfServeVllmModel;
 (function(SelfServeVllmModel2) {
+  SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
+  SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
   SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
 })(SelfServeVllmModel || (SelfServeVllmModel = {}));
@@ -4642,6 +4644,21 @@ var AudioEncoding;
     return NAME_TO_ENUM.has(nameStr.toUpperCase());
   }
   AudioEncoding2.isNameValid = isNameValid;
+  function coerce2(value, onStringInput) {
+    if (value === void 0) {
+      return AudioEncoding2.LINEAR16;
+    }
+    if (typeof value === "number") {
+      return value;
+    }
+    const result = fromName(value);
+    if (result === void 0) {
+      throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
+    }
+    onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
+    return result;
+  }
+  AudioEncoding2.coerce = coerce2;
 })(AudioEncoding || (AudioEncoding = {}));
 var PREFIX_AUDIO_ENCODING_OFFSET = 128;
 var SampleRate;
@@ -5412,6 +5429,37 @@ var MessageHandler = class {
   }
 };
+// src/utils/audio-resampler.ts
+function downsamplePcm16(input, srcRate, targetRate) {
+  if (targetRate > srcRate) {
+    throw new Error(
+      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
+    );
+  }
+  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
+  const src = new Int16Array(buffer);
+  if (srcRate === targetRate || src.length === 0) {
+    return src.slice().buffer;
+  }
+  const ratio = srcRate / targetRate;
+  const dstLen = Math.floor(src.length / ratio);
+  const dst = new Int16Array(dstLen);
+  for (let i = 0; i < dstLen; i++) {
+    const startPos = i * ratio;
+    const endPos = (i + 1) * ratio;
+    const startIdx = Math.floor(startPos);
+    const endIdx = Math.min(Math.ceil(endPos), src.length);
+    let sum = 0;
+    let count = 0;
+    for (let j = startIdx; j < endIdx; j++) {
+      sum += src[j] ?? 0;
+      count++;
+    }
+    dst[i] = count > 0 ? Math.round(sum / count) : 0;
+  }
+  return dst.buffer;
+}
 // src/errors.ts
 var RecognitionError = class extends Error {
   constructor(errorType, message) {
@@ -5512,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
     const retryConfig = config.connectionRetry || {};
     const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
     const delayMs = retryConfig.delayMs ?? 200;
+    const normalizedASRConfig = config.asrRequestConfig ? {
+      ...config.asrRequestConfig,
+      encoding: AudioEncoding.coerce(
+        config.asrRequestConfig.encoding,
+        (warning) => config.logger?.("warn", warning)
+      )
+    } : void 0;
     this.config = {
       url,
       audioUtteranceId,
-      ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
+      ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
       ...config.gameContext && { gameContext: config.gameContext },
       ...config.callbackUrls && { callbackUrls: config.callbackUrls },
       onTranscript: config.onTranscript || (() => {
@@ -5713,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
     }
     this.sendAudioInternal(audioData);
   }
+  /**
+   * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
+   * to the session's target rate (currently 16 kHz per server validator)
+   * before sending.
+   *
+   * Use this when your capture pipeline produces audio at the system's
+   * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
+   * desktop/mobile hardware — and you don't want to bring your own
+   * resampler. If your audio is already at the target rate, prefer
+   * `sendAudio()` to skip the resample step.
+   *
+   * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
+   * dependencies, has a built-in low-pass effect so aliasing stays out of
+   * the speech band. Suitable for ASR; not a substitute for a high-quality
+   * resampler if you're doing music or full-fidelity processing.
+   *
+   * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
+   * mixed to mono by the caller.
+   *
+   * @param audioData - PCM16 mono audio at `sourceSampleRate`.
+   * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
+   */
+  sendAudioWithSampleRate(audioData, sourceSampleRate) {
+    const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
+    if (audioData instanceof Blob) {
+      blobToArrayBuffer(audioData).then((arrayBuffer) => {
+        this.sendAudioInternal(
+          downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
+        );
+      }).catch((error) => {
+        this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
+      });
+      return;
+    }
+    this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
+  }
   sendAudioInternal(audioData) {
     const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
     if (bytes === 0) return;
@@ -5859,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
         model: this.config.asrRequestConfig.model,
         language: this.config.asrRequestConfig.language?.toString() || "en",
         sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
-        encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
+        encoding: this.config.asrRequestConfig.encoding,
         interimResults: this.config.asrRequestConfig.interimResults ?? false,
         // Auto-enable useContext if gameContext is provided, or use explicit value if set
         useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -6023,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
    */
   sendAudioNow(audioData) {
     const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
-    const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
+    const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
     const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
     super.sendAudio(
       audioData,
@@ -6091,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
   sendPrefixAudioNow(audioData) {
     const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
     if (byteLength === 0) return;
-    const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
+    const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
     const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
     const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
     this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
@@ -6344,9 +6435,15 @@ var RecognitionVGFStateSchema = z.object({
   // voice end time identified by ASR
   lastNonSilence: z.number().optional(),
   // last non-silence sample time from PCM analysis
+  accumulatedAudioTimeMs: z.number().optional(),
+  // total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
   // Tracking-only metadata
   asrConfig: z.string().optional(),
-  // Json format of the ASR config
+  // Json format of the *requested* ASR config (set once at construction).
+  // For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
+  // server after circuit-breaker/fallback — see `sessionConfigured` below.
+  sessionConfigured: SessionConfiguredSchemaV1.optional(),
+  // Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
   startRecordingTimestamp: z.string().optional(),
   // Start of recording. Immutable after set.
   finalRecordingTimestamp: z.string().optional(),
@@ -6367,6 +6464,17 @@ var RecognitionVGFStateSchema = z.object({
   // Support for prompt slot mapping - passed to recognition context when present
   promptSlotMap: z.record(z.string(), z.array(z.string())).optional(),
   // Optional map of slot names to prompt values for recognition context
+  // Optional prompt inputs - when set, forwarded into GameContext at client creation.
+  // Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
+  promptSTT: z.string().optional(),
+  promptSTF: z.string().optional(),
+  promptTTF: z.string().optional(),
+  // Provider-reported phrase detections from the last transcript message.
+  // Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
+  // (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
+  // Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
+  // and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
+  detections: z.array(DetectionV1Schema).optional(),
   // Recognition action processing state - managed externally, SDK preserves but never modifies
   recognitionActionProcessingState: z.string().optional()
   // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
@@ -6438,6 +6546,9 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
     if (result.lastNonSilence !== void 0) {
       newState.lastNonSilence = result.lastNonSilence;
     }
+    if (result.accumulatedAudioTimeMs !== void 0) {
+      newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
+    }
   } else {
     newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
     newState.finalTranscript = result.finalTranscript || "";
@@ -6451,12 +6562,24 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
     if (result.lastNonSilence !== void 0) {
       newState.lastNonSilence = result.lastNonSilence;
     }
+    if (result.accumulatedAudioTimeMs !== void 0) {
+      newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
+    }
     newState.pendingTranscript = "";
     newState.pendingConfidence = void 0;
   }
+  if (result.detections !== void 0) {
+    newState.detections = result.detections;
+  }
   return newState;
 }
-function mapErrorToState(currentState, error) {
+function mapSessionConfiguredToState(currentState, sessionConfigured) {
+  return {
+    ...currentState,
+    sessionConfigured
+  };
+}
+function mapErrorToState(currentState) {
   return {
     ...currentState,
     transcriptionStatus: TranscriptionStatus.ERROR,
@@ -6488,7 +6611,10 @@ function resetRecognitionVGFState(currentState) {
     recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
     finalTranscript: void 0,
     voiceEnd: void 0,
-    lastNonSilence: void 0
+    lastNonSilence: void 0,
+    accumulatedAudioTimeMs: void 0,
+    detections: void 0,
+    sessionConfigured: void 0
   };
 }
 function generateUUID() {
@@ -6533,16 +6659,28 @@ var SimplifiedVGFRecognitionClient = class {
     }
     this.state = { ...this.state, startRecordingStatus: "READY" };
     this.expectedUuid = this.state.audioUtteranceId;
-    if (this.state.promptSlotMap) {
+    const hasPromptInputs = this.state.promptSlotMap !== void 0 || this.state.promptSTT !== void 0 || this.state.promptSTF !== void 0 || this.state.promptTTF !== void 0;
+    if (hasPromptInputs) {
       if (clientConfig.asrRequestConfig) {
         clientConfig.asrRequestConfig.useContext = true;
       }
       if (!clientConfig.gameContext) {
         if (clientConfig.logger) {
-          clientConfig.logger("warn", "[VGF] promptSlotMap found but no gameContext provided. SlotMap will not be sent.");
+          clientConfig.logger("warn", "[VGF] prompt inputs found but no gameContext provided. They will not be sent.");
         }
       } else {
-        clientConfig.gameContext.slotMap = this.state.promptSlotMap;
+        if (this.state.promptSlotMap !== void 0) {
+          clientConfig.gameContext.slotMap = this.state.promptSlotMap;
+        }
+        if (this.state.promptSTT !== void 0) {
+          clientConfig.gameContext.promptSTT = this.state.promptSTT;
+        }
+        if (this.state.promptSTF !== void 0) {
+          clientConfig.gameContext.promptSTF = this.state.promptSTF;
+        }
+        if (this.state.promptTTF !== void 0) {
+          clientConfig.gameContext.promptTTF = this.state.promptTTF;
+        }
       }
     }
     this.client = new RealTimeTwoWayWebSocketRecognitionClient({
@@ -6578,6 +6716,22 @@ var SimplifiedVGFRecognitionClient = class {
           clientConfig.onMetadata(metadata);
         }
       },
+      onSessionConfigured: (sessionConfigured) => {
+        if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
+          if (this.logger) {
+            this.logger(
+              "warn",
+              `[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
+            );
+          }
+          return;
+        }
+        this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
+        this.notifyStateChange();
+        if (clientConfig.onSessionConfigured) {
+          clientConfig.onSessionConfigured(sessionConfigured);
+        }
+      },
       onFunctionCall: (result) => {
         if (clientConfig.onFunctionCall) {
           clientConfig.onFunctionCall(result);
@@ -6594,7 +6748,7 @@ var SimplifiedVGFRecognitionClient = class {
           return;
         }
         this.isRecordingAudio = false;
-        this.state = mapErrorToState(this.state, error);
+        this.state = mapErrorToState(this.state);
         this.notifyStateChange();
         if (clientConfig.onError) {
           clientConfig.onError(error);
@@ -6619,17 +6773,34 @@ var SimplifiedVGFRecognitionClient = class {
     await this.client.connect();
   }
   sendAudio(audioData) {
-    if (!this.isRecordingAudio) {
-      this.isRecordingAudio = true;
-      this.state = {
-        ...this.state,
-        startRecordingStatus: "RECORDING",
-        startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
-      };
-      this.notifyStateChange();
-    }
+    this.markRecordingStarted();
     this.client.sendAudio(audioData);
   }
+  sendAudioWithSampleRate(audioData, sourceSampleRate) {
+    this.markRecordingStarted();
+    this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
+  }
+  sendPrefixAudio(audioData) {
+    this.client.sendPrefixAudio(audioData);
+  }
+  getStats() {
+    return this.client.getStats();
+  }
+  /**
+   * Set VGF recording status to RECORDING on the first audio chunk.
+   * Idempotent — subsequent calls are no-ops until disconnect/stop resets
+   * `isRecordingAudio`.
+   */
+  markRecordingStarted() {
+    if (this.isRecordingAudio) return;
+    this.isRecordingAudio = true;
+    this.state = {
+      ...this.state,
+      startRecordingStatus: "RECORDING",
+      startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
+    };
+    this.notifyStateChange();
+  }
   async stopRecording() {
     this.isRecordingAudio = false;
     this.state = updateStateOnStop(this.state);