@volley/recognition-client-sdk 0.1.767 → 0.1.799

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3749,6 +3749,7 @@ var RecognitionProvider;
3749
3749
  RecognitionProvider2["BEDROCK"] = "bedrock";
3750
3750
  RecognitionProvider2["INWORLD_STT"] = "inworld-stt";
3751
3751
  RecognitionProvider2["AWS_TRANSCRIBE"] = "aws-transcribe";
3752
+ RecognitionProvider2["AMAZON_NOVA_SONIC"] = "amazon-nova-sonic";
3752
3753
  RecognitionProvider2["TEST_ASR_PROVIDER_QUOTA"] = "test-asr-provider-quota";
3753
3754
  RecognitionProvider2["TEST_ASR_STREAMING"] = "test-asr-streaming";
3754
3755
  })(RecognitionProvider || (RecognitionProvider = {}));
@@ -3799,6 +3800,7 @@ var ElevenLabsModel;
3799
3800
  })(ElevenLabsModel || (ElevenLabsModel = {}));
3800
3801
  var OpenAIRealtimeModel;
3801
3802
  (function(OpenAIRealtimeModel2) {
3803
+ OpenAIRealtimeModel2["GPT_REALTIME_WHISPER"] = "gpt-realtime-whisper";
3802
3804
  OpenAIRealtimeModel2["GPT_4O_TRANSCRIBE"] = "gpt-4o-transcribe";
3803
3805
  OpenAIRealtimeModel2["GPT_4O_MINI_TRANSCRIBE"] = "gpt-4o-mini-transcribe";
3804
3806
  })(OpenAIRealtimeModel || (OpenAIRealtimeModel = {}));
@@ -3829,8 +3831,15 @@ var AwsTranscribeModel;
3829
3831
  (function(AwsTranscribeModel2) {
3830
3832
  AwsTranscribeModel2["DEFAULT"] = "default";
3831
3833
  })(AwsTranscribeModel || (AwsTranscribeModel = {}));
3834
// String-valued enum of Amazon Nova Sonic model identifiers.
// Keys are the enum member names; values are the model id strings.
var AmazonNovaSonicModel;
AmazonNovaSonicModel = AmazonNovaSonicModel || {};
Object.assign(AmazonNovaSonicModel, {
  AMAZON_NOVA_SONIC_V1: "amazon.nova-sonic-v1:0",
  AMAZON_NOVA_2_SONIC: "amazon.nova-2-sonic-v1:0"
});
3832
3839
  var SelfServeVllmModel;
3833
3840
  (function(SelfServeVllmModel2) {
3841
+ SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
3842
+ SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
3834
3843
  SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
3835
3844
  })(SelfServeVllmModel || (SelfServeVllmModel = {}));
3836
3845
 
@@ -3845,6 +3854,18 @@ var RecognitionResultTypeV1;
3845
3854
  RecognitionResultTypeV12["AUDIO_METRICS"] = "AudioMetrics";
3846
3855
  RecognitionResultTypeV12["SESSION_CONFIGURED"] = "SessionConfigured";
3847
3856
  })(RecognitionResultTypeV1 || (RecognitionResultTypeV1 = {}));
3857
// String-valued enum of detection kinds. Only "search" is defined here.
var DetectionTypeV1;
DetectionTypeV1 = DetectionTypeV1 || {};
Object.assign(DetectionTypeV1, {
  SEARCH: "search"
});
3861
// Zod schema for a single detection entry (one phrase hit).
var DetectionV1Schema = z.object({
  // Detection kind; must be one of the DetectionTypeV1 enum values.
  type: z.nativeEnum(DetectionTypeV1),
  // The phrase this hit refers to (required string).
  query: z.string(),
  // Score for the hit, validated to lie in the closed range [0, 1].
  score: z.number().min(0).max(1),
  // Optional. Presumably the audio time (ms from stream start) where the hit
  // starts, mirroring endMs below — TODO confirm against the provider docs.
  startMs: z.number().optional(),
  // Optional. Audio time (ms from stream start) where the hit ends
  endMs: z.number().optional()
});
3848
3869
  var TranscriptionResultSchemaV1 = z.object({
3849
3870
  type: z.literal(RecognitionResultTypeV1.TRANSCRIPTION),
3850
3871
  audioUtteranceId: z.string(),
@@ -3863,8 +3884,9 @@ var TranscriptionResultSchemaV1 = z.object({
3863
3884
  endTimestamp: z.number().optional(),
3864
3885
  receivedAtMs: z.number().optional(),
3865
3886
  accumulatedAudioTimeMs: z.number().optional(),
3866
- rawAudioTimeMs: z.number().optional()
3867
- // Total audio duration sent to provider (includes prefix)
3887
+ rawAudioTimeMs: z.number().optional(),
3888
+ detections: z.array(DetectionV1Schema).optional()
3889
+ // Provider-reported phrase detections (query + score, optionally startMs/endMs). Always populated when the provider returns hits, regardless of `appendSearch`. Other providers leave this undefined.
3868
3890
  });
3869
3891
  var FunctionCallResultSchemaV1 = z.object({
3870
3892
  type: z.literal(RecognitionResultTypeV1.FUNCTION_CALL),
@@ -4117,7 +4139,15 @@ var TranscriptMessageSchema = z.object({
4117
4139
  * @example true
4118
4140
  * @default false
4119
4141
  */
4120
- is_fallback: z.boolean().optional()
4142
+ is_fallback: z.boolean().optional(),
4143
+ /**
4144
+ * Provider-reported phrase detections (query + score, optionally
4145
+ * startMs/endMs). Always populated when the provider returns hits,
4146
+ * regardless of `appendSearch` or scene gating. Other providers leave
4147
+ * this undefined.
4148
+ * @example [{ query: 'justin bieber one time', score: 0.78, startMs: 1200, endMs: 2800 }]
4149
+ */
4150
+ detections: z.array(DetectionV1Schema).optional()
4121
4151
  });
4122
4152
  var VADEndSignalSchema = z.object({
4123
4153
  type: z.literal(ProviderMessageType.VAD_END_SIGNAL),
@@ -4434,6 +4464,9 @@ var ASRRequestSchemaV1 = z.object({
4434
4464
  // Streaming audio metrics opt-in: when > 0, server emits AudioMetrics results throttled to this interval (ms).
4435
4465
  // Undefined / 0 disables streaming audio metrics (final metrics still embedded in Metadata).
4436
4466
  audioMetricsIntervalMs: z.number().optional(),
4467
+ // Opt-in: round-trip Deepgram `search` phrase hits into the transcript.
4468
+ // Active only when (model = deepgram nova-2) AND (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch in asr-config.types.ts for full semantics.
4469
+ appendSearch: z.boolean().optional(),
4437
4470
  // Debug options (FOR DEBUG/TESTING ONLY - not for production use)
4438
4471
  debugCommand: RequestDebugCommandSchema
4439
4472
  });
@@ -4588,6 +4621,21 @@ var AudioEncoding;
4588
4621
  return NAME_TO_ENUM.has(nameStr.toUpperCase());
4589
4622
  }
4590
4623
  AudioEncoding2.isNameValid = isNameValid;
4624
/**
 * Normalize a caller-supplied encoding into an AudioEncoding value.
 *
 * - `undefined` / `null` -> `AudioEncoding.LINEAR16` (the default). `null` is
 *   accepted as "unset" so this agrees with the `?? AudioEncoding.LINEAR16`
 *   fallbacks used when audio is sent.
 * - number -> returned unchanged. It is NOT checked against the known enum
 *   members; callers are trusted to pass a valid id.
 * - anything else -> looked up by name via `fromName`.
 *
 * @param value - Encoding as enum number, encoding name, or unset.
 * @param onStringInput - Optional callback invoked with a warning message when
 *   a name was supplied and successfully resolved.
 * @returns The resolved encoding value.
 * @throws {Error} When `fromName` cannot resolve the supplied name.
 */
function coerce2(value, onStringInput) {
  // `== null` intentionally matches both undefined and null.
  if (value == null) {
    return AudioEncoding2.LINEAR16;
  }
  if (typeof value === "number") {
    return value;
  }
  const result = fromName(value);
  if (result === void 0) {
    throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
  }
  // Resolved fine, but nudge the caller towards the enum form.
  onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(result)} enum for type safety`);
  return result;
}
4638
+ AudioEncoding2.coerce = coerce2;
4591
4639
  })(AudioEncoding || (AudioEncoding = {}));
4592
4640
  var PREFIX_AUDIO_ENCODING_OFFSET = 128;
4593
4641
  var SampleRate;
@@ -5313,6 +5361,37 @@ var MessageHandler = class {
5313
5361
  }
5314
5362
  };
5315
5363
 
5364
+ // src/utils/audio-resampler.ts
5365
/**
 * Downsample 16-bit PCM samples by box-filter averaging.
 *
 * Each output sample is the rounded mean of the source samples whose indices
 * fall in [floor(i * ratio), ceil((i + 1) * ratio)), where
 * ratio = srcRate / targetRate. The input is never modified and the returned
 * buffer never aliases it.
 *
 * @param input - ArrayBuffer or ArrayBuffer view holding 16-bit samples in
 *   platform byte order (read through Int16Array).
 * @param srcRate - Sample rate of `input` in Hz.
 * @param targetRate - Desired output sample rate in Hz; must be <= srcRate.
 * @returns A new ArrayBuffer of 16-bit samples at `targetRate`.
 * @throws {RangeError} If either rate is not a positive finite number, or if
 *   the input byte length is odd (not a whole number of 16-bit samples).
 * @throws {Error} If `targetRate` is greater than `srcRate` (upsampling).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Reject missing/NaN/zero/negative rates up front. Without this, a forgotten
  // or bogus rate produces a NaN or Infinity ratio and a silently EMPTY output.
  for (const [label, rate] of [["srcRate", srcRate], ["targetRate", targetRate]]) {
    if (!Number.isFinite(rate) || rate <= 0) {
      throw new RangeError(
        `downsamplePcm16: ${label} must be a positive finite number of Hz (got ${String(rate)})`
      );
    }
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  const isView = ArrayBuffer.isView(input);
  if (input.byteLength % 2 !== 0) {
    // Same error type the Int16Array constructor would raise, but with a
    // message that names the actual problem.
    throw new RangeError(
      `downsamplePcm16: input byte length ${input.byteLength} is not a multiple of 2 (expected 16-bit samples)`
    );
  }
  // For views, copy exactly the viewed window so byteOffset/alignment of the
  // backing buffer never matters.
  const buffer = isView
    ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength)
    : input;
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    // `buffer` is already a private copy for views; only a raw ArrayBuffer
    // input still needs copying to avoid aliasing the caller's data.
    return isView ? buffer : buffer.slice(0);
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
5394
+
5316
5395
  // src/errors.ts
5317
5396
  var RecognitionError = class extends Error {
5318
5397
  constructor(errorType, message) {
@@ -5387,10 +5466,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5387
5466
  const retryConfig = config.connectionRetry || {};
5388
5467
  const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
5389
5468
  const delayMs = retryConfig.delayMs ?? 200;
5469
+ const normalizedASRConfig = config.asrRequestConfig ? {
5470
+ ...config.asrRequestConfig,
5471
+ encoding: AudioEncoding.coerce(
5472
+ config.asrRequestConfig.encoding,
5473
+ (warning) => config.logger?.("warn", warning)
5474
+ )
5475
+ } : void 0;
5390
5476
  this.config = {
5391
5477
  url,
5392
5478
  audioUtteranceId,
5393
- ...config.asrRequestConfig && { asrRequestConfig: config.asrRequestConfig },
5479
+ ...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
5394
5480
  ...config.gameContext && { gameContext: config.gameContext },
5395
5481
  ...config.callbackUrls && { callbackUrls: config.callbackUrls },
5396
5482
  onTranscript: config.onTranscript || (() => {
@@ -5588,6 +5674,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5588
5674
  }
5589
5675
  this.sendAudioInternal(audioData);
5590
5676
  }
5677
+ /**
5678
+ * Send PCM16 mono audio captured at any sample rate. The SDK downsamples
5679
+ * to the session's target rate (currently 16 kHz per server validator)
5680
+ * before sending.
5681
+ *
5682
+ * Use this when your capture pipeline produces audio at the system's
5683
+ * native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
5684
+ * desktop/mobile hardware — and you don't want to bring your own
5685
+ * resampler. If your audio is already at the target rate, prefer
5686
+ * `sendAudio()` to skip the resample step.
5687
+ *
5688
+ * Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
5689
+ * dependencies, has a built-in low-pass effect so aliasing stays out of
5690
+ * the speech band. Suitable for ASR; not a substitute for a high-quality
5691
+ * resampler if you're doing music or full-fidelity processing.
5692
+ *
5693
+ * Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
5694
+ * mixed to mono by the caller.
5695
+ *
5696
+ * @param audioData - PCM16 mono audio at `sourceSampleRate`.
5697
+ * @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
5698
+ */
5699
+ sendAudioWithSampleRate(audioData, sourceSampleRate) {
5700
+ const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5701
+ if (audioData instanceof Blob) {
5702
+ blobToArrayBuffer(audioData).then((arrayBuffer) => {
5703
+ this.sendAudioInternal(
5704
+ downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
5705
+ );
5706
+ }).catch((error) => {
5707
+ this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
5708
+ });
5709
+ return;
5710
+ }
5711
+ this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
5712
+ }
5591
5713
  sendAudioInternal(audioData) {
5592
5714
  const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5593
5715
  if (bytes === 0) return;
@@ -5734,7 +5856,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5734
5856
  model: this.config.asrRequestConfig.model,
5735
5857
  language: this.config.asrRequestConfig.language?.toString() || "en",
5736
5858
  sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
5737
- encoding: typeof this.config.asrRequestConfig.encoding === "number" ? this.config.asrRequestConfig.encoding : AudioEncoding.LINEAR16,
5859
+ encoding: this.config.asrRequestConfig.encoding,
5738
5860
  interimResults: this.config.asrRequestConfig.interimResults ?? false,
5739
5861
  // Auto-enable useContext if gameContext is provided, or use explicit value if set
5740
5862
  useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
@@ -5759,6 +5881,12 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5759
5881
  // Streaming audio metrics opt-in (ms interval). Server only forwards metrics if > 0.
5760
5882
  ...this.config.asrRequestConfig.audioMetricsIntervalMs !== void 0 && {
5761
5883
  audioMetricsIntervalMs: this.config.asrRequestConfig.audioMetricsIntervalMs
5884
+ },
5885
+ // Opt-in: round-trip Deepgram nova-2 search-phrase hits into the
5886
+ // transcript. Only fires server-side when (model = nova-2) AND
5887
+ // (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch.
5888
+ ...this.config.asrRequestConfig.appendSearch !== void 0 && {
5889
+ appendSearch: this.config.asrRequestConfig.appendSearch
5762
5890
  }
5763
5891
  };
5764
5892
  super.sendMessage(
@@ -5892,7 +6020,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5892
6020
  */
5893
6021
  sendAudioNow(audioData) {
5894
6022
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5895
- const encodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6023
+ const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
5896
6024
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5897
6025
  super.sendAudio(
5898
6026
  audioData,
@@ -5960,7 +6088,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
5960
6088
  sendPrefixAudioNow(audioData) {
5961
6089
  const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
5962
6090
  if (byteLength === 0) return;
5963
- const baseEncodingId = this.config.asrRequestConfig?.encoding || AudioEncoding.LINEAR16;
6091
+ const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
5964
6092
  const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
5965
6093
  const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
5966
6094
  this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });