@volley/recognition-client-sdk 0.1.767 → 0.1.799

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -3749,6 +3749,7 @@ var RecognitionProvider;
3749
3749
  RecognitionProvider2["BEDROCK"] = "bedrock";
3750
3750
  RecognitionProvider2["INWORLD_STT"] = "inworld-stt";
3751
3751
  RecognitionProvider2["AWS_TRANSCRIBE"] = "aws-transcribe";
3752
+ RecognitionProvider2["AMAZON_NOVA_SONIC"] = "amazon-nova-sonic";
3752
3753
  RecognitionProvider2["TEST_ASR_PROVIDER_QUOTA"] = "test-asr-provider-quota";
3753
3754
  RecognitionProvider2["TEST_ASR_STREAMING"] = "test-asr-streaming";
3754
3755
  })(RecognitionProvider || (RecognitionProvider = {}));
@@ -3799,6 +3800,7 @@ var ElevenLabsModel;
3799
3800
  })(ElevenLabsModel || (ElevenLabsModel = {}));
3800
3801
  var OpenAIRealtimeModel;
3801
3802
  (function(OpenAIRealtimeModel2) {
3803
+ OpenAIRealtimeModel2["GPT_REALTIME_WHISPER"] = "gpt-realtime-whisper";
3802
3804
  OpenAIRealtimeModel2["GPT_4O_TRANSCRIBE"] = "gpt-4o-transcribe";
3803
3805
  OpenAIRealtimeModel2["GPT_4O_MINI_TRANSCRIBE"] = "gpt-4o-mini-transcribe";
3804
3806
  })(OpenAIRealtimeModel || (OpenAIRealtimeModel = {}));
@@ -3829,8 +3831,15 @@ var AwsTranscribeModel;
3829
3831
  (function(AwsTranscribeModel2) {
3830
3832
  AwsTranscribeModel2["DEFAULT"] = "default";
3831
3833
  })(AwsTranscribeModel || (AwsTranscribeModel = {}));
3834
// Model identifier strings selectable under the "amazon-nova-sonic" provider.
var AmazonNovaSonicModel;
(function(modelTable) {
  const members = [
    ["AMAZON_NOVA_SONIC_V1", "amazon.nova-sonic-v1:0"],
    ["AMAZON_NOVA_2_SONIC", "amazon.nova-2-sonic-v1:0"]
  ];
  // Insert in declaration order so key enumeration order is unchanged.
  for (const [memberName, modelId] of members) {
    modelTable[memberName] = modelId;
  }
})(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
3832
3839
  var SelfServeVllmModel;
3833
3840
  (function(SelfServeVllmModel2) {
3841
+ SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
3842
+ SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
3834
3843
  SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
3835
3844
  })(SelfServeVllmModel || (SelfServeVllmModel = {}));
3836
3845
 
@@ -3845,6 +3854,18 @@ var RecognitionResultTypeV1;
3845
3854
  RecognitionResultTypeV12["AUDIO_METRICS"] = "AudioMetrics";
3846
3855
  RecognitionResultTypeV12["SESSION_CONFIGURED"] = "SessionConfigured";
3847
3856
  })(RecognitionResultTypeV1 || (RecognitionResultTypeV1 = {}));
3857
// Kinds of provider-reported detections; "search" is the only member defined here.
var DetectionTypeV1;
(function(detectionKinds) {
  detectionKinds["SEARCH"] = "search";
})(DetectionTypeV1 || (DetectionTypeV1 = {}));
// Validation schema for a single detection entry (used as the element type of
// the optional `detections` arrays on transcript results).
var DetectionV1Schema = (() => {
  // Both timing fields share the same "optional number" shape.
  const optionalMs = () => z.number().optional();
  return z.object({
    type: z.nativeEnum(DetectionTypeV1),
    query: z.string(),
    // Score is constrained to the closed range [0, 1].
    score: z.number().min(0).max(1),
    // NOTE(review): presumably the audio time where the hit starts, mirroring endMs — confirm.
    startMs: optionalMs(),
    // Audio time (ms from stream start) where the hit ends
    endMs: optionalMs()
  });
})();
3848
3869
  var TranscriptionResultSchemaV1 = z.object({
3849
3870
  type: z.literal(RecognitionResultTypeV1.TRANSCRIPTION),
3850
3871
  audioUtteranceId: z.string(),
@@ -3863,8 +3884,9 @@ var TranscriptionResultSchemaV1 = z.object({
3863
3884
  endTimestamp: z.number().optional(),
3864
3885
  receivedAtMs: z.number().optional(),
3865
3886
  accumulatedAudioTimeMs: z.number().optional(),
3866
- rawAudioTimeMs: z.number().optional()
3867
- // Total audio duration sent to provider (includes prefix)
3887
+ rawAudioTimeMs: z.number().optional(),
3888
+ detections: z.array(DetectionV1Schema).optional()
3889
+ // Provider-reported phrase detections (query + score, optionally startMs/endMs). Always populated when the provider returns hits, regardless of `appendSearch`. Other providers leave this undefined.
3868
3890
  });
3869
3891
  var FunctionCallResultSchemaV1 = z.object({
3870
3892
  type: z.literal(RecognitionResultTypeV1.FUNCTION_CALL),
@@ -4117,7 +4139,15 @@ var TranscriptMessageSchema = z.object({
4117
4139
  * @example true
4118
4140
  * @default false
4119
4141
  */
4120
- is_fallback: z.boolean().optional()
4142
+ is_fallback: z.boolean().optional(),
4143
+ /**
4144
+ * Provider-reported phrase detections (query + score, optionally
4145
+ * startMs/endMs). Always populated when the provider returns hits,
4146
+ * regardless of `appendSearch` or scene gating. Other providers leave
4147
+ * this undefined.
4148
+ * @example [{ query: 'justin bieber one time', score: 0.78, startMs: 1200, endMs: 2800 }]
4149
+ */
4150
+ detections: z.array(DetectionV1Schema).optional()
4121
4151
  });
4122
4152
  var VADEndSignalSchema = z.object({
4123
4153
  type: z.literal(ProviderMessageType.VAD_END_SIGNAL),
@@ -4457,6 +4487,9 @@ var ASRRequestSchemaV1 = z.object({
4457
4487
  // Streaming audio metrics opt-in: when > 0, server emits AudioMetrics results throttled to this interval (ms).
4458
4488
  // Undefined / 0 disables streaming audio metrics (final metrics still embedded in Metadata).
4459
4489
  audioMetricsIntervalMs: z.number().optional(),
4490
+ // Opt-in: round-trip Deepgram `search` phrase hits into the transcript.
4491
+ // Active only when (model = deepgram nova-2) AND (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch in asr-config.types.ts for full semantics.
4492
+ appendSearch: z.boolean().optional(),
4460
4493
  // Debug options (FOR DEBUG/TESTING ONLY - not for production use)
4461
4494
  debugCommand: RequestDebugCommandSchema
4462
4495
  });
@@ -4611,6 +4644,21 @@ var AudioEncoding;
4611
4644
  return NAME_TO_ENUM.has(nameStr.toUpperCase());
4612
4645
  }
4613
4646
  AudioEncoding2.isNameValid = isNameValid;
4647
/**
 * Normalize a caller-supplied encoding into an AudioEncoding enum value.
 *
 * - `undefined` / `null`  -> `AudioEncoding2.LINEAR16` (the default).
 * - a number              -> returned unchanged (not range-checked here).
 * - anything else         -> resolved through `fromName`; an unresolved name throws.
 *
 * @param value - Encoding as an enum number, a name string, or nothing.
 * @param onStringInput - Optional callback that receives a warning message
 *   whenever a name string was accepted, nudging callers toward the enum.
 * @returns The numeric AudioEncoding value.
 * @throws {Error} When `fromName` cannot resolve the given name.
 */
function coerce2(value, onStringInput) {
  // `== null` matches both undefined and null. A null encoding used to fall
  // back to LINEAR16 before normalization was introduced, so keep that
  // behavior instead of handing null to the name lookup.
  if (value == null) {
    return AudioEncoding2.LINEAR16;
  }
  if (typeof value === "number") {
    return value;
  }
  const result = fromName(value);
  if (result === void 0) {
    throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
  }
  onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
  return result;
}
4661
+ AudioEncoding2.coerce = coerce2;
4614
4662
  })(AudioEncoding || (AudioEncoding = {}));
4615
4663
  var PREFIX_AUDIO_ENCODING_OFFSET = 128;
4616
4664
  var SampleRate;
@@ -5381,6 +5429,37 @@ var MessageHandler = class {
5381
5429
  }
5382
5430
  };
5383
5431
 
5432
+ // src/utils/audio-resampler.ts
5433
/**
 * Downsample 16-bit PCM audio by box-filter averaging.
 *
 * With `ratio = srcRate / targetRate`, output sample `i` is the rounded mean
 * of the source samples at indices `[floor(i * ratio), ceil((i + 1) * ratio))`,
 * clamped to the input length. The bytes are read through an `Int16Array`.
 * The input is never modified and the result is always a newly allocated buffer.
 *
 * @param input - ArrayBuffer or typed-array/DataView holding PCM16 samples.
 * @param srcRate - Sample rate of `input` in Hz.
 * @param targetRate - Desired output sample rate in Hz (must be <= srcRate).
 * @returns ArrayBuffer containing the downsampled PCM16 samples.
 * @throws {RangeError} If either rate is not a positive finite number, or the
 *   input byte length is odd (not a whole number of 16-bit samples).
 * @throws {Error} If `targetRate` is greater than `srcRate` (upsampling).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Without this guard a missing/NaN/zero rate makes `ratio` NaN or Infinity,
  // which yields a zero-length result and silently drops the audio.
  const isValidRate = (rate) => typeof rate === "number" && Number.isFinite(rate) && rate > 0;
  if (!isValidRate(srcRate) || !isValidRate(targetRate)) {
    throw new RangeError(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${String(srcRate)}, targetRate=${String(targetRate)}).`
    );
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // A trailing half-sample means the chunk is not sample-aligned. Fail with an
  // actionable message instead of the typed-array constructor's generic error.
  if (input.byteLength % 2 !== 0) {
    throw new RangeError(
      `downsamplePcm16: PCM16 input must have an even byte length (got ${input.byteLength} bytes).`
    );
  }
  // For views, copy exactly the viewed byte window so byteOffset alignment and
  // unrelated bytes of the backing buffer do not matter.
  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    // Nothing to resample; still hand back an independent copy.
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
5462
+
5384
5463
  // src/errors.ts
5385
5464
  var RecognitionError = class extends Error {
5386
5465
  constructor(errorType, message) {
@@ -5481,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5481
5560
  const retryConfig = config.connectionRetry || {};
5482
5561
  const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
5483
5562
  const delayMs = retryConfig.delayMs ?? 200;
5563
+ const normalizedASRConfig = config.asrRequestConfig ? {
5564
+ ...config.asrRequestConfig,
5565
+ encoding: AudioEncoding.coerce(
5566
+ config.asrRequestConfig.encoding,
5567
+ (warning) => config.logger?.("warn", warning)
5568
+ )
5569
+ } : void 0;
5484
5570
  this.config = {
5485
5571
  url,
5486
5572
  audioUtteranceId,
5487
- ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
5573
+ ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
5488
5574
  ...config.gameContext && { gameContext: config.gameContext },
5489
5575
  ...config.callbackUrls && { callbackUrls: config.callbackUrls },
5490
5576
  onTranscript: config.onTranscript || (() => {
@@ -5682,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5682
5768
  }
5683
5769
  this.sendAudioInternal(audioData);
5684
5770
  }
5771
+ /**
5772
+ * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
5773
+ * to the session's target rate (currently 16 kHz per server validator)
5774
+ * before sending.
5775
+ *
5776
+ * Use this when your capture pipeline produces audio at the system's
5777
+ * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
5778
+ * desktop/mobile hardware — and you don't want to bring your own
5779
+ * resampler. If your audio is already at the target rate, prefer
5780
+ * `sendAudio()` to skip the resample step.
5781
+ *
5782
+ * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
5783
+ * dependencies, has a built-in low-pass effect so aliasing stays out of
5784
+ * the speech band. Suitable for ASR; not a substitute for a high-quality
5785
+ * resampler if you're doing music or full-fidelity processing.
5786
+ *
5787
+ * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
5788
+ * mixed to mono by the caller.
5789
+ *
5790
+ * @param audioData - PCM16 mono audio at `sourceSampleRate`.
5791
+ * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
5792
+ */
5793
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
5794
+ const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5795
+ if (audioData instanceof Blob) {
5796
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
5797
+ this.sendAudioInternal(
5798
+ downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
5799
+ );
5800
+ }).catch((error) => {
5801
+ this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
5802
+ });
5803
+ return;
5804
+ }
5805
+ this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
5806
+ }
5685
5807
  sendAudioInternal(audioData) {
5686
5808
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5687
5809
  if (bytes === 0) return;
@@ -5828,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5828
5950
  model: this.config.asrRequestConfig.model,
5829
5951
  language: this.config.asrRequestConfig.language?.toString() || "en",
5830
5952
  sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
5831
- encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
5953
+ encoding: this.config.asrRequestConfig.encoding,
5832
5954
  interimResults: this.config.asrRequestConfig.interimResults ?? false,
5833
5955
  // Auto-enable useContext if gameContext is provided, or use explicit value if set
5834
5956
  useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -5853,6 +5975,12 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5853
5975
  // Streaming audio metrics opt-in (ms interval). Server only forwards metrics if > 0.
5854
5976
  ...this.config.asrRequestConfig.audioMetricsIntervalMs !== void 0 && {
5855
5977
  audioMetricsIntervalMs: this.config.asrRequestConfig.audioMetricsIntervalMs
5978
+ },
5979
+ // Opt-in: round-trip Deepgram nova-2 search-phrase hits into the
5980
+ // transcript. Only fires server-side when (model = nova-2) AND
5981
+ // (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch.
5982
+ ...this.config.asrRequestConfig.appendSearch !== void 0 && {
5983
+ appendSearch: this.config.asrRequestConfig.appendSearch
5856
5984
  }
5857
5985
  };
5858
5986
  super.sendMessage(
@@ -5986,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5986
6114
  */
5987
6115
  sendAudioNow(audioData) {
5988
6116
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5989
- const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6117
+ const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
5990
6118
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5991
6119
  super.sendAudio(
5992
6120
  audioData,
@@ -6054,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
6054
6182
  sendPrefixAudioNow(audioData) {
6055
6183
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
6056
6184
  if (byteLength === 0) return;
6057
- const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6185
+ const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
6058
6186
  const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
6059
6187
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
6060
6188
  this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
@@ -6582,17 +6710,28 @@ var SimplifiedVGFRecognitionClient = class {
6582
6710
  await this.client.connect();
6583
6711
  }
6584
6712
  sendAudio(audioData) {
6585
- if (!this.isRecordingAudio) {
6586
- this.isRecordingAudio = true;
6587
- this.state = {
6588
- ...this.state,
6589
- startRecordingStatus: "RECORDING",
6590
- startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
6591
- };
6592
- this.notifyStateChange();
6593
- }
6713
+ this.markRecordingStarted();
6594
6714
  this.client.sendAudio(audioData);
6595
6715
  }
6716
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
6717
+ this.markRecordingStarted();
6718
+ this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
6719
+ }
6720
+ /**
6721
+ * Set VGF recording status to RECORDING on the first audio chunk.
6722
+ * Idempotent — subsequent calls are no-ops until disconnect/stop resets
6723
+ * `isRecordingAudio`.
6724
+ */
6725
+ markRecordingStarted() {
6726
+ if (this.isRecordingAudio) return;
6727
+ this.isRecordingAudio = true;
6728
+ this.state = {
6729
+ ...this.state,
6730
+ startRecordingStatus: "RECORDING",
6731
+ startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
6732
+ };
6733
+ this.notifyStateChange();
6734
+ }
6596
6735
  async stopRecording() {
6597
6736
  this.isRecordingAudio = false;
6598
6737
  this.state = updateStateOnStop(this.state);
@@ -6704,6 +6843,7 @@ function createSimplifiedVGFClient(config) {
6704
6843
  return new SimplifiedVGFRecognitionClient(config);
6705
6844
  }
6706
6845
  export {
6846
+ AmazonNovaSonicModel,
6707
6847
  AudioEncoding,
6708
6848
  AwsTranscribeModel,
6709
6849
  BedrockModel,