@volley/recognition-client-sdk 0.1.782 → 0.1.800

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
3838
3838
  })(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
3839
3839
  // String-valued enum object: each key on SelfServeVllmModel maps to the model identifier string
  // assigned below. The IIFE fills in the existing object if one is already defined, otherwise it
  // creates a fresh `{}` first (`SelfServeVllmModel || (SelfServeVllmModel = {})`).
  // In this diff the two `qwen3-asr-0.6b*` entries are added ahead of the pre-existing 1.7b entry.
  var SelfServeVllmModel;
3840
3840
  (function(SelfServeVllmModel2) {
3841
+ SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
3842
+ SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
3841
3843
  SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
3842
3844
  })(SelfServeVllmModel || (SelfServeVllmModel = {}));
3843
3845
 
@@ -4642,6 +4644,21 @@ var AudioEncoding;
4642
4644
  return NAME_TO_ENUM.has(nameStr.toUpperCase());
4643
4645
  }
4644
4646
  AudioEncoding2.isNameValid = isNameValid;
4647
/**
 * Normalize a caller-supplied encoding into a numeric AudioEncoding value.
 *
 * - `undefined` or `null`  -> `AudioEncoding.LINEAR16` (the default).
 *   `null` is accepted because configs deserialized from JSON use it for
 *   "not set"; the call sites that read the encoding fall back with `??`,
 *   which treats `null` and `undefined` alike, and earlier releases also
 *   defaulted every non-number to LINEAR16.
 * - a number               -> returned unchanged (not range-checked here).
 * - anything else          -> resolved through `fromName`; when that yields
 *   `undefined` the input is rejected.
 *
 * @param value - Encoding as enum number, name string, `null`, or `undefined`.
 * @param onStringInput - Optional callback invoked with a warning message
 *   when a string name was accepted, nudging callers toward the enum.
 * @returns The numeric encoding.
 * @throws {Error} When `value` is a name that `fromName` does not resolve.
 */
function coerce2(value, onStringInput) {
  if (value === void 0 || value === null) {
    return AudioEncoding2.LINEAR16;
  }
  if (typeof value === "number") {
    return value;
  }
  const result = fromName(value);
  if (result === void 0) {
    throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
  }
  onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
  return result;
}
4661
+ AudioEncoding2.coerce = coerce2;
4645
4662
  })(AudioEncoding || (AudioEncoding = {}));
4646
4663
  var PREFIX_AUDIO_ENCODING_OFFSET = 128;
4647
4664
  var SampleRate;
@@ -5412,6 +5429,37 @@ var MessageHandler = class {
5412
5429
  }
5413
5430
  };
5414
5431
 
5432
+ // src/utils/audio-resampler.ts
5433
/**
 * Downsample signed 16-bit mono PCM using box-filter averaging.
 *
 * Each output sample is the rounded mean of the source samples whose indices
 * fall in `[floor(i * ratio), ceil((i + 1) * ratio))`, where
 * `ratio = srcRate / targetRate`. Samples are read through an `Int16Array`,
 * i.e. in the platform's byte order.
 *
 * The input is never modified; the result is always a freshly allocated
 * `ArrayBuffer` (also when no resampling is needed).
 *
 * @param input - `ArrayBuffer` or any ArrayBuffer view holding PCM16 bytes.
 *   A trailing odd byte (half a sample) is ignored rather than making the
 *   typed-array constructor throw an opaque RangeError.
 * @param srcRate - Source sample rate in Hz; must be a positive finite number.
 * @param targetRate - Target sample rate in Hz; must be a positive finite
 *   number not greater than `srcRate`.
 * @returns ArrayBuffer containing PCM16 samples at `targetRate`.
 * @throws {RangeError} When either rate is not a positive finite number
 *   (previously such input silently produced an empty buffer).
 * @throws {Error} When `targetRate > srcRate` (upsampling is unsupported).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  const ratesAreValid =
    Number.isFinite(srcRate) && srcRate > 0 && Number.isFinite(targetRate) && targetRate > 0;
  if (!ratesAreValid) {
    throw new RangeError(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // Views are copied out so an arbitrary (possibly odd) byteOffset cannot
  // violate Int16Array's 2-byte alignment requirement.
  const bytes = ArrayBuffer.isView(input)
    ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength)
    : input;
  // Passing an explicit length lets the view sit on an odd-sized buffer; the
  // dangling final byte is simply left out.
  const src = new Int16Array(bytes, 0, Math.floor(bytes.byteLength / 2));
  if (srcRate === targetRate || src.length === 0) {
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    // Clamp so the last window never reads past the end of the source.
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
5462
+
5415
5463
  // src/errors.ts
5416
5464
  var RecognitionError = class extends Error {
5417
5465
  constructor(errorType, message) {
@@ -5512,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5512
5560
  const retryConfig = config.connectionRetry || {};
5513
5561
  const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
5514
5562
  const delayMs = retryConfig.delayMs ?? 200;
5563
+ const normalizedASRConfig = config.asrRequestConfig ? {
5564
+ ...config.asrRequestConfig,
5565
+ encoding: AudioEncoding.coerce(
5566
+ config.asrRequestConfig.encoding,
5567
+ (warning) => config.logger?.("warn", warning)
5568
+ )
5569
+ } : void 0;
5515
5570
  this.config = {
5516
5571
  url,
5517
5572
  audioUtteranceId,
5518
- ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
5573
+ ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
5519
5574
  ...config.gameContext && { gameContext: config.gameContext },
5520
5575
  ...config.callbackUrls && { callbackUrls: config.callbackUrls },
5521
5576
  onTranscript: config.onTranscript || (() => {
@@ -5713,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5713
5768
  }
5714
5769
  this.sendAudioInternal(audioData);
5715
5770
  }
5771
+ /**
5772
+ * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
5773
+ * to the session's target rate (currently 16 kHz per server validator)
5774
+ * before sending.
5775
+ *
5776
+ * Use this when your capture pipeline produces audio at the system's
5777
+ * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
5778
+ * desktop/mobile hardware — and you don't want to bring your own
5779
+ * resampler. If your audio is already at the target rate, prefer
5780
+ * `sendAudio()` to skip the resample step.
5781
+ *
5782
+ * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
5783
+ * dependencies, has a built-in low-pass effect so aliasing stays out of
5784
+ * the speech band. Suitable for ASR; not a substitute for a high-quality
5785
+ * resampler if you're doing music or full-fidelity processing.
5786
+ *
5787
+ * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
5788
+ * mixed to mono by the caller.
5789
+ *
5790
+ * @param audioData - PCM16 mono audio at `sourceSampleRate`.
5791
+ * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
5792
+ */
5793
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
5794
+ const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5795
+ if (audioData instanceof Blob) {
5796
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
5797
+ this.sendAudioInternal(
5798
+ downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
5799
+ );
5800
+ }).catch((error) => {
5801
+ this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
5802
+ });
5803
+ return;
5804
+ }
5805
+ this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
5806
+ }
5716
5807
  sendAudioInternal(audioData) {
5717
5808
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5718
5809
  if (bytes === 0) return;
@@ -5859,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5859
5950
  model: this.config.asrRequestConfig.model,
5860
5951
  language: this.config.asrRequestConfig.language?.toString() || "en",
5861
5952
  sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
5862
- encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
5953
+ encoding: this.config.asrRequestConfig.encoding,
5863
5954
  interimResults: this.config.asrRequestConfig.interimResults ?? false,
5864
5955
  // Auto-enable useContext if gameContext is provided, or use explicit value if set
5865
5956
  useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -6023,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
6023
6114
  */
6024
6115
  sendAudioNow(audioData) {
6025
6116
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
6026
- const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6117
+ const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
6027
6118
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
6028
6119
  super.sendAudio(
6029
6120
  audioData,
@@ -6091,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
6091
6182
  sendPrefixAudioNow(audioData) {
6092
6183
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
6093
6184
  if (byteLength === 0) return;
6094
- const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6185
+ const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
6095
6186
  const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
6096
6187
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
6097
6188
  this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
@@ -6344,9 +6435,15 @@ var RecognitionVGFStateSchema = z.object({
6344
6435
  // voice end time identified by ASR
6345
6436
  lastNonSilence: z.number().optional(),
6346
6437
  // last non-silence sample time from PCM analysis
6438
+ accumulatedAudioTimeMs: z.number().optional(),
6439
+ // total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
6347
6440
  // Tracking-only metadata
6348
6441
  asrConfig: z.string().optional(),
6349
- // Json format of the ASR config
6442
+ // Json format of the *requested* ASR config (set once at construction).
6443
+ // For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
6444
+ // server after circuit-breaker/fallback — see `sessionConfigured` below.
6445
+ sessionConfigured: SessionConfiguredSchemaV1.optional(),
6446
+ // Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
6350
6447
  startRecordingTimestamp: z.string().optional(),
6351
6448
  // Start of recording. Immutable after set.
6352
6449
  finalRecordingTimestamp: z.string().optional(),
@@ -6367,6 +6464,17 @@ var RecognitionVGFStateSchema = z.object({
6367
6464
  // Support for prompt slot mapping - passed to recognition context when present
6368
6465
  promptSlotMap: z.record(z.string(), z.array(z.string())).optional(),
6369
6466
  // Optional map of slot names to prompt values for recognition context
6467
+ // Optional prompt inputs - when set, forwarded into GameContext at client creation.
6468
+ // Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
6469
+ promptSTT: z.string().optional(),
6470
+ promptSTF: z.string().optional(),
6471
+ promptTTF: z.string().optional(),
6472
+ // Provider-reported phrase detections from the last transcript message.
6473
+ // Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
6474
+ // (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
6475
+ // Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
6476
+ // and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
6477
+ detections: z.array(DetectionV1Schema).optional(),
6370
6478
  // Recognition action processing state - managed externally, SDK preserves but never modifies
6371
6479
  recognitionActionProcessingState: z.string().optional()
6372
6480
  // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
@@ -6438,6 +6546,9 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
6438
6546
  if (result.lastNonSilence !== void 0) {
6439
6547
  newState.lastNonSilence = result.lastNonSilence;
6440
6548
  }
6549
+ if (result.accumulatedAudioTimeMs !== void 0) {
6550
+ newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
6551
+ }
6441
6552
  } else {
6442
6553
  newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
6443
6554
  newState.finalTranscript = result.finalTranscript || "";
@@ -6451,12 +6562,24 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
6451
6562
  if (result.lastNonSilence !== void 0) {
6452
6563
  newState.lastNonSilence = result.lastNonSilence;
6453
6564
  }
6565
+ if (result.accumulatedAudioTimeMs !== void 0) {
6566
+ newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
6567
+ }
6454
6568
  newState.pendingTranscript = "";
6455
6569
  newState.pendingConfidence = void 0;
6456
6570
  }
6571
+ if (result.detections !== void 0) {
6572
+ newState.detections = result.detections;
6573
+ }
6457
6574
  return newState;
6458
6575
  }
6459
- function mapErrorToState(currentState, error) {
6576
// Builds the next VGF state after a session-configured message.
// Returns a new object: a shallow copy of `currentState` (object spread) with its
// `sessionConfigured` property set to the given value, overwriting any previous one.
// `currentState` itself is not mutated; `sessionConfigured` is stored by reference.
+ function mapSessionConfiguredToState(currentState, sessionConfigured) {
6577
+ return {
6578
+ ...currentState,
6579
+ sessionConfigured
6580
+ };
6581
+ }
6582
+ function mapErrorToState(currentState) {
6460
6583
  return {
6461
6584
  ...currentState,
6462
6585
  transcriptionStatus: TranscriptionStatus.ERROR,
@@ -6488,7 +6611,10 @@ function resetRecognitionVGFState(currentState) {
6488
6611
  recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
6489
6612
  finalTranscript: void 0,
6490
6613
  voiceEnd: void 0,
6491
- lastNonSilence: void 0
6614
+ lastNonSilence: void 0,
6615
+ accumulatedAudioTimeMs: void 0,
6616
+ detections: void 0,
6617
+ sessionConfigured: void 0
6492
6618
  };
6493
6619
  }
6494
6620
  function generateUUID() {
@@ -6533,16 +6659,28 @@ var SimplifiedVGFRecognitionClient = class {
6533
6659
  }
6534
6660
  this.state = { ...this.state, startRecordingStatus: "READY" };
6535
6661
  this.expectedUuid = this.state.audioUtteranceId;
6536
- if (this.state.promptSlotMap) {
6662
+ const hasPromptInputs = this.state.promptSlotMap !== void 0 || this.state.promptSTT !== void 0 || this.state.promptSTF !== void 0 || this.state.promptTTF !== void 0;
6663
+ if (hasPromptInputs) {
6537
6664
  if (clientConfig.asrRequestConfig) {
6538
6665
  clientConfig.asrRequestConfig.useContext = true;
6539
6666
  }
6540
6667
  if (!clientConfig.gameContext) {
6541
6668
  if (clientConfig.logger) {
6542
- clientConfig.logger("warn", "[VGF] promptSlotMap found but no gameContext provided. SlotMap will not be sent.");
6669
+ clientConfig.logger("warn", "[VGF] prompt inputs found but no gameContext provided. They will not be sent.");
6543
6670
  }
6544
6671
  } else {
6545
- clientConfig.gameContext.slotMap = this.state.promptSlotMap;
6672
+ if (this.state.promptSlotMap !== void 0) {
6673
+ clientConfig.gameContext.slotMap = this.state.promptSlotMap;
6674
+ }
6675
+ if (this.state.promptSTT !== void 0) {
6676
+ clientConfig.gameContext.promptSTT = this.state.promptSTT;
6677
+ }
6678
+ if (this.state.promptSTF !== void 0) {
6679
+ clientConfig.gameContext.promptSTF = this.state.promptSTF;
6680
+ }
6681
+ if (this.state.promptTTF !== void 0) {
6682
+ clientConfig.gameContext.promptTTF = this.state.promptTTF;
6683
+ }
6546
6684
  }
6547
6685
  }
6548
6686
  this.client = new RealTimeTwoWayWebSocketRecognitionClient({
@@ -6578,6 +6716,22 @@ var SimplifiedVGFRecognitionClient = class {
6578
6716
  clientConfig.onMetadata(metadata);
6579
6717
  }
6580
6718
  },
6719
// Handler passed to the wrapped client for its session-configured message.
// 1. If the message carries a truthy audioUtteranceId that differs from this.expectedUuid, it is
//    ignored (a warning is logged when this.logger is set). A message without an audioUtteranceId
//    is NOT filtered and falls through to step 2.
// 2. Otherwise the payload is stored on this.state via mapSessionConfiguredToState and state
//    listeners are notified.
// 3. Finally the caller-supplied clientConfig.onSessionConfigured, if any, receives the same payload.
+ onSessionConfigured: (sessionConfigured) => {
6720
+ if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
6721
+ if (this.logger) {
6722
+ this.logger(
6723
+ "warn",
6724
+ `[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
6725
+ );
6726
+ }
6727
+ return;
6728
+ }
6729
+ this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
6730
+ this.notifyStateChange();
6731
+ if (clientConfig.onSessionConfigured) {
6732
+ clientConfig.onSessionConfigured(sessionConfigured);
6733
+ }
6734
+ },
6581
6735
  onFunctionCall: (result) => {
6582
6736
  if (clientConfig.onFunctionCall) {
6583
6737
  clientConfig.onFunctionCall(result);
@@ -6594,7 +6748,7 @@ var SimplifiedVGFRecognitionClient = class {
6594
6748
  return;
6595
6749
  }
6596
6750
  this.isRecordingAudio = false;
6597
- this.state = mapErrorToState(this.state, error);
6751
+ this.state = mapErrorToState(this.state);
6598
6752
  this.notifyStateChange();
6599
6753
  if (clientConfig.onError) {
6600
6754
  clientConfig.onError(error);
@@ -6619,17 +6773,34 @@ var SimplifiedVGFRecognitionClient = class {
6619
6773
  await this.client.connect();
6620
6774
  }
6621
6775
  // Sends one audio chunk through the wrapped client.
  // Before forwarding, the first chunk flips the VGF state to RECORDING. The added line delegates
  // that to markRecordingStarted(); the removed lines are the previous inline version of the same
  // update (set isRecordingAudio, stamp startRecordingStatus/startRecordingTimestamp, notify).
  sendAudio(audioData) {
6622
- if (!this.isRecordingAudio) {
6623
- this.isRecordingAudio = true;
6624
- this.state = {
6625
- ...this.state,
6626
- startRecordingStatus: "RECORDING",
6627
- startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
6628
- };
6629
- this.notifyStateChange();
6630
- }
6776
+ this.markRecordingStarted();
6631
6777
  this.client.sendAudio(audioData);
6632
6778
  }
6779
// VGF wrapper around the wrapped client's sendAudioWithSampleRate().
// Marks recording as started (same first-chunk state update as sendAudio) and then forwards
// both arguments unchanged; any resampling is done by the wrapped client, not here.
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
6780
+ this.markRecordingStarted();
6781
+ this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
6782
+ }
6783
// Pass-through to the wrapped client's sendPrefixAudio().
// Unlike sendAudio/sendAudioWithSampleRate, this does not call markRecordingStarted(), so sending
// prefix audio alone leaves the VGF recording status untouched.
// NOTE(review): presumably intentional because prefix audio is not user speech — confirm.
+ sendPrefixAudio(audioData) {
6784
+ this.client.sendPrefixAudio(audioData);
6785
+ }
6786
// Pass-through: returns whatever the wrapped client's getStats() returns, unmodified.
+ getStats() {
6787
+ return this.client.getStats();
6788
+ }
6789
+ /**
6790
+ * Set VGF recording status to RECORDING on the first audio chunk.
6791
+ * Idempotent — subsequent calls are no-ops until disconnect/stop resets
6792
+ * `isRecordingAudio`.
 *
 * On the call that takes effect it replaces `this.state` with a shallow copy
 * whose `startRecordingStatus` is "RECORDING" and whose
 * `startRecordingTimestamp` is the current time as an ISO-8601 string, then
 * calls `notifyStateChange()` exactly once.
6793
+ */
6794
+ markRecordingStarted() {
6795
+ if (this.isRecordingAudio) return;
6796
+ this.isRecordingAudio = true;
6797
+ this.state = {
6798
+ ...this.state,
6799
+ startRecordingStatus: "RECORDING",
6800
+ startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
6801
+ };
6802
+ this.notifyStateChange();
6803
+ }
6633
6804
  async stopRecording() {
6634
6805
  this.isRecordingAudio = false;
6635
6806
  this.state = updateStateOnStop(this.state);