@volley/recognition-client-sdk 0.1.767 → 0.1.782

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -152,9 +152,12 @@ builder
152
152
  .onError(error => {}) // Handle errors
153
153
  .onConnected(() => {}) // Connection established
154
154
  .onDisconnected((code) => {}) // Connection closed
155
- .onMetadata(meta => {}) // Timing information
155
+ .onMetadata(meta => {}) // Timing information + final audio metrics (always-on)
156
+ .onAudioMetrics(m => {}) // Live audio-quality metrics (opt-in, since 0.1.767)
156
157
  ```
157
158
 
159
+ > **Audio metrics**: every session delivers a final `audioMetrics` snapshot embedded in `Metadata` (volume, silence ratio, clipping, SNR — all derived from PCM, not from the ASR provider). To also receive live per-chunk updates while audio is flowing, set `asrRequestConfig.audioMetricsIntervalMs > 0` and register `.onAudioMetrics()`. Available in SDK **≥ 0.1.767**. See [audio-metrics-alpha.md](https://github.com/Volley-Inc/recognition-service/blob/dev/docs/design/functional-features/observability/audio-metrics-alpha.md) for the full schema, and the repo-root [CHANGELOG.md](https://github.com/Volley-Inc/recognition-service/blob/dev/CHANGELOG.md) for SDK release history.
160
+
158
161
  ### Optional Parameters
159
162
 
160
163
  ```typescript
@@ -24,6 +24,7 @@ declare enum RecognitionProvider {
24
24
  BEDROCK = "bedrock",
25
25
  INWORLD_STT = "inworld-stt",
26
26
  AWS_TRANSCRIBE = "aws-transcribe",
27
+ AMAZON_NOVA_SONIC = "amazon-nova-sonic",
27
28
  TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
28
29
  TEST_ASR_STREAMING = "test-asr-streaming"
29
30
  }
@@ -96,6 +97,7 @@ declare enum ElevenLabsModel {
96
97
  * @see https://platform.openai.com/docs/models/gpt-4o-transcribe
97
98
  */
98
99
  declare enum OpenAIRealtimeModel {
100
+ GPT_REALTIME_WHISPER = "gpt-realtime-whisper",
99
101
  GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
100
102
  GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
101
103
  }
@@ -146,6 +148,15 @@ declare enum InworldSttModel {
146
148
  declare enum AwsTranscribeModel {
147
149
  DEFAULT = "default"
148
150
  }
151
+ /**
152
+ * Amazon Nova Sonic bidirectional streaming model (Bedrock).
153
+ * Speech-to-speech model; we consume the USER FINAL transcript and discard the assistant text/audio output.
154
+ * @see https://docs.aws.amazon.com/nova/latest/userguide/speech-bidirection.html
155
+ */
156
+ declare enum AmazonNovaSonicModel {
157
+ AMAZON_NOVA_SONIC_V1 = "amazon.nova-sonic-v1:0",
158
+ AMAZON_NOVA_2_SONIC = "amazon.nova-2-sonic-v1:0"
159
+ }
149
160
  /**
150
161
  * Self-serve vLLM batch transcription models
151
162
  * Backed by recognition-inference / RunPod `/transcribe`
@@ -156,125 +167,7 @@ declare enum SelfServeVllmModel {
156
167
  /**
157
168
  * Type alias for any model from any provider
158
169
  */
159
- type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | GladiaModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | CartesiaModel | DashScopeModel | InworldSttModel | SelfServeVllmModel | BedrockModel | AwsTranscribeModel | string;
160
-
161
- /**
162
- * Audio encoding types
163
- */
164
- declare enum AudioEncoding {
165
- ENCODING_UNSPECIFIED = 0,
166
- LINEAR16 = 1,
167
- OGG_OPUS = 2,
168
- FLAC = 3,
169
- MULAW = 4,
170
- ALAW = 5
171
- }
172
- declare namespace AudioEncoding {
173
- /**
174
- * Convert numeric ID to AudioEncoding enum
175
- * @param id - Numeric encoding identifier (0-5)
176
- * @returns AudioEncoding enum value or undefined if invalid
177
- */
178
- function fromId(id: number): AudioEncoding | undefined;
179
- /**
180
- * Convert string name to AudioEncoding enum
181
- * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
182
- * @returns AudioEncoding enum value or undefined if invalid
183
- */
184
- function fromName(nameStr: string): AudioEncoding | undefined;
185
- /**
186
- * Convert AudioEncoding enum to numeric ID
187
- * @param encoding - AudioEncoding enum value
188
- * @returns Numeric ID (0-5)
189
- */
190
- function toId(encoding: AudioEncoding): number;
191
- /**
192
- * Convert AudioEncoding enum to string name
193
- * @param encoding - AudioEncoding enum value
194
- * @returns String name like "LINEAR16", "MULAW", etc.
195
- */
196
- function toName(encoding: AudioEncoding): string;
197
- /**
198
- * Check if a numeric ID is a valid encoding
199
- * @param id - Numeric identifier to validate
200
- * @returns true if valid encoding ID
201
- */
202
- function isIdValid(id: number): boolean;
203
- /**
204
- * Check if a string name is a valid encoding
205
- * @param nameStr - String name to validate
206
- * @returns true if valid encoding name
207
- */
208
- function isNameValid(nameStr: string): boolean;
209
- }
210
- /**
211
- * Common sample rates (in Hz)
212
- */
213
- declare enum SampleRate {
214
- RATE_8000 = 8000,
215
- RATE_16000 = 16000,
216
- RATE_22050 = 22050,
217
- RATE_24000 = 24000,
218
- RATE_32000 = 32000,
219
- RATE_44100 = 44100,
220
- RATE_48000 = 48000
221
- }
222
- declare namespace SampleRate {
223
- /**
224
- * Convert Hz value to SampleRate enum
225
- * @param hz - Sample rate in Hz (8000, 16000, etc.)
226
- * @returns SampleRate enum value or undefined if invalid
227
- */
228
- function fromHz(hz: number): SampleRate | undefined;
229
- /**
230
- * Convert string name to SampleRate enum
231
- * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
232
- * @returns SampleRate enum value or undefined if invalid
233
- */
234
- function fromName(nameStr: string): SampleRate | undefined;
235
- /**
236
- * Convert SampleRate enum to Hz value
237
- * @param rate - SampleRate enum value
238
- * @returns Hz value (8000, 16000, etc.)
239
- */
240
- function toHz(rate: SampleRate): number;
241
- /**
242
- * Convert SampleRate enum to string name
243
- * @param rate - SampleRate enum value
244
- * @returns String name like "RATE_8000", "RATE_16000", etc.
245
- */
246
- function toName(rate: SampleRate): string;
247
- /**
248
- * Check if a numeric Hz value is a valid sample rate
249
- * @param hz - Hz value to validate
250
- * @returns true if valid sample rate
251
- */
252
- function isHzValid(hz: number): boolean;
253
- /**
254
- * Check if a string name is a valid sample rate
255
- * @param nameStr - String name to validate
256
- * @returns true if valid sample rate name
257
- */
258
- function isNameValid(nameStr: string): boolean;
259
- }
260
- /**
261
- * Supported languages for recognition
262
- * Using BCP-47 language tags
263
- */
264
- declare enum Language {
265
- ENGLISH_US = "en-US",
266
- ENGLISH_GB = "en-GB",
267
- SPANISH_ES = "es-ES",
268
- SPANISH_MX = "es-MX",
269
- FRENCH_FR = "fr-FR",
270
- GERMAN_DE = "de-DE",
271
- ITALIAN_IT = "it-IT",
272
- PORTUGUESE_BR = "pt-BR",
273
- JAPANESE_JP = "ja-JP",
274
- KOREAN_KR = "ko-KR",
275
- CHINESE_CN = "zh-CN",
276
- CHINESE_TW = "zh-TW"
277
- }
170
+ type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | GladiaModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | CartesiaModel | DashScopeModel | InworldSttModel | SelfServeVllmModel | BedrockModel | AwsTranscribeModel | AmazonNovaSonicModel | string;
278
171
 
279
172
  /**
280
173
  * Recognition Result Types V1
@@ -294,6 +187,16 @@ declare enum RecognitionResultTypeV1 {
294
187
  AUDIO_METRICS = "AudioMetrics",
295
188
  SESSION_CONFIGURED = "SessionConfigured"
296
189
  }
190
+ /**
191
+ * Source of a phrase detection — what kind of provider feature produced
192
+ * the hit. Currently only Deepgram's `search` parameter is wired up, so
193
+ * this enum has one value. New entries (e.g. KEYWORDS, KEYTERMS,
194
+ * SPEECH_CONTEXTS) get added when other providers join.
195
+ */
196
+ declare enum DetectionTypeV1 {
197
+ /** Deepgram phonetic phrase match via the `search=…` request parameter */
198
+ SEARCH = "search"
199
+ }
297
200
  /**
298
201
  * Transcription result V1 - contains transcript message
299
202
  * In the long run the game side should not need to know about it. In the short run it is sent back to the client.
@@ -318,6 +221,25 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
318
221
  receivedAtMs: z.ZodOptional<z.ZodNumber>;
319
222
  accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
320
223
  rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
224
+ detections: z.ZodOptional<z.ZodArray<z.ZodObject<{
225
+ type: z.ZodNativeEnum<typeof DetectionTypeV1>;
226
+ query: z.ZodString;
227
+ score: z.ZodNumber;
228
+ startMs: z.ZodOptional<z.ZodNumber>;
229
+ endMs: z.ZodOptional<z.ZodNumber>;
230
+ }, "strip", z.ZodTypeAny, {
231
+ type: DetectionTypeV1;
232
+ query: string;
233
+ score: number;
234
+ startMs?: number | undefined;
235
+ endMs?: number | undefined;
236
+ }, {
237
+ type: DetectionTypeV1;
238
+ query: string;
239
+ score: number;
240
+ startMs?: number | undefined;
241
+ endMs?: number | undefined;
242
+ }>, "many">>;
321
243
  }, "strip", z.ZodTypeAny, {
322
244
  type: RecognitionResultTypeV1.TRANSCRIPTION;
323
245
  audioUtteranceId: string;
@@ -337,6 +259,13 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
337
259
  receivedAtMs?: number | undefined;
338
260
  accumulatedAudioTimeMs?: number | undefined;
339
261
  rawAudioTimeMs?: number | undefined;
262
+ detections?: {
263
+ type: DetectionTypeV1;
264
+ query: string;
265
+ score: number;
266
+ startMs?: number | undefined;
267
+ endMs?: number | undefined;
268
+ }[] | undefined;
340
269
  }, {
341
270
  type: RecognitionResultTypeV1.TRANSCRIPTION;
342
271
  audioUtteranceId: string;
@@ -356,6 +285,13 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
356
285
  receivedAtMs?: number | undefined;
357
286
  accumulatedAudioTimeMs?: number | undefined;
358
287
  rawAudioTimeMs?: number | undefined;
288
+ detections?: {
289
+ type: DetectionTypeV1;
290
+ query: string;
291
+ score: number;
292
+ startMs?: number | undefined;
293
+ endMs?: number | undefined;
294
+ }[] | undefined;
359
295
  }>;
360
296
  type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
361
297
  /**
@@ -667,6 +603,124 @@ declare const AudioMetricsResultSchemaV1: z.ZodObject<{
667
603
  }>;
668
604
  type AudioMetricsResultV1 = z.infer<typeof AudioMetricsResultSchemaV1>;
669
605
 
606
+ /**
607
+ * Audio encoding types
608
+ */
609
+ declare enum AudioEncoding {
610
+ ENCODING_UNSPECIFIED = 0,
611
+ LINEAR16 = 1,
612
+ OGG_OPUS = 2,
613
+ FLAC = 3,
614
+ MULAW = 4,
615
+ ALAW = 5
616
+ }
617
+ declare namespace AudioEncoding {
618
+ /**
619
+ * Convert numeric ID to AudioEncoding enum
620
+ * @param id - Numeric encoding identifier (0-5)
621
+ * @returns AudioEncoding enum value or undefined if invalid
622
+ */
623
+ function fromId(id: number): AudioEncoding | undefined;
624
+ /**
625
+ * Convert string name to AudioEncoding enum
626
+ * @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
627
+ * @returns AudioEncoding enum value or undefined if invalid
628
+ */
629
+ function fromName(nameStr: string): AudioEncoding | undefined;
630
+ /**
631
+ * Convert AudioEncoding enum to numeric ID
632
+ * @param encoding - AudioEncoding enum value
633
+ * @returns Numeric ID (0-5)
634
+ */
635
+ function toId(encoding: AudioEncoding): number;
636
+ /**
637
+ * Convert AudioEncoding enum to string name
638
+ * @param encoding - AudioEncoding enum value
639
+ * @returns String name like "LINEAR16", "MULAW", etc.
640
+ */
641
+ function toName(encoding: AudioEncoding): string;
642
+ /**
643
+ * Check if a numeric ID is a valid encoding
644
+ * @param id - Numeric identifier to validate
645
+ * @returns true if valid encoding ID
646
+ */
647
+ function isIdValid(id: number): boolean;
648
+ /**
649
+ * Check if a string name is a valid encoding
650
+ * @param nameStr - String name to validate
651
+ * @returns true if valid encoding name
652
+ */
653
+ function isNameValid(nameStr: string): boolean;
654
+ }
655
+ /**
656
+ * Common sample rates (in Hz)
657
+ */
658
+ declare enum SampleRate {
659
+ RATE_8000 = 8000,
660
+ RATE_16000 = 16000,
661
+ RATE_22050 = 22050,
662
+ RATE_24000 = 24000,
663
+ RATE_32000 = 32000,
664
+ RATE_44100 = 44100,
665
+ RATE_48000 = 48000
666
+ }
667
+ declare namespace SampleRate {
668
+ /**
669
+ * Convert Hz value to SampleRate enum
670
+ * @param hz - Sample rate in Hz (8000, 16000, etc.)
671
+ * @returns SampleRate enum value or undefined if invalid
672
+ */
673
+ function fromHz(hz: number): SampleRate | undefined;
674
+ /**
675
+ * Convert string name to SampleRate enum
676
+ * @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
677
+ * @returns SampleRate enum value or undefined if invalid
678
+ */
679
+ function fromName(nameStr: string): SampleRate | undefined;
680
+ /**
681
+ * Convert SampleRate enum to Hz value
682
+ * @param rate - SampleRate enum value
683
+ * @returns Hz value (8000, 16000, etc.)
684
+ */
685
+ function toHz(rate: SampleRate): number;
686
+ /**
687
+ * Convert SampleRate enum to string name
688
+ * @param rate - SampleRate enum value
689
+ * @returns String name like "RATE_8000", "RATE_16000", etc.
690
+ */
691
+ function toName(rate: SampleRate): string;
692
+ /**
693
+ * Check if a numeric Hz value is a valid sample rate
694
+ * @param hz - Hz value to validate
695
+ * @returns true if valid sample rate
696
+ */
697
+ function isHzValid(hz: number): boolean;
698
+ /**
699
+ * Check if a string name is a valid sample rate
700
+ * @param nameStr - String name to validate
701
+ * @returns true if valid sample rate name
702
+ */
703
+ function isNameValid(nameStr: string): boolean;
704
+ }
705
+ /**
706
+ * Supported languages for recognition
707
+ * Using BCP-47 language tags
708
+ */
709
+ declare enum Language {
710
+ ENGLISH_US = "en-US",
711
+ ENGLISH_GB = "en-GB",
712
+ SPANISH_ES = "es-ES",
713
+ SPANISH_MX = "es-MX",
714
+ FRENCH_FR = "fr-FR",
715
+ GERMAN_DE = "de-DE",
716
+ ITALIAN_IT = "it-IT",
717
+ PORTUGUESE_BR = "pt-BR",
718
+ JAPANESE_JP = "ja-JP",
719
+ KOREAN_KR = "ko-KR",
720
+ CHINESE_CN = "zh-CN",
721
+ CHINESE_TW = "zh-TW"
722
+ }
723
+
670
724
  /**
671
725
  * Recognition Context Types V1
672
726
  * NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
@@ -949,6 +1003,29 @@ interface ASRRequestConfig {
949
1003
  * @example 500
950
1004
  */
951
1005
  audioMetricsIntervalMs?: number;
1006
+ /**
1007
+ * Opt-in: round-trip Deepgram `search` phrase hits into the transcript (despite the option name, the matched query is prepended, not appended).
1008
+ *
1009
+ * When `true` AND the resolved provider/model is **deepgram nova-2** AND the
1010
+ * GameContext `gamePhase` is `'Solve Puzzle'`, every Deepgram Results event
1011
+ * with a `channel.search` hit at confidence ≥ 0.6 has the original query
1012
+ * prepended to the transcript text delivered to the client. This restores
1013
+ * parity with the legacy Roku→Deepgram WoF Puzzle-Solve path where the
1014
+ * phrase round-trip lets downstream NLU match multi-word puzzle solutions
1015
+ * even when nova-2's primary transcription drifts.
1016
+ *
1017
+ * Default: `false` (no prepend; transcript is whatever nova-2 produces).
1018
+ *
1019
+ * Scope guard rationale:
1020
+ * - nova-2 only: nova-3 / flux do not need this (they handle phrase
1021
+ * spotting differently and the prepend would only add noise).
1022
+ * - Solve-Puzzle scene only: other WoF scenes (Letter-Guess,
1023
+ * Bonus-Round, etc.) do NOT want the slotMap phrase prepended — only
1024
+ * Puzzle-Solve depends on the phrase round-trip.
1025
+ *
1026
+ * @default false
1027
+ */
1028
+ appendSearch?: boolean;
952
1029
  /**
953
1030
  * Optional fallback ASR configurations
954
1031
  *