@volley/recognition-client-sdk 0.1.767 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/dist/browser.bundled.d.ts +256 -123
- package/dist/index.bundled.d.ts +279 -125
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +156 -16
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +135 -7
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +1 -1
- package/src/index.spec.ts +2 -0
- package/src/index.ts +1 -0
- package/src/recognition-client.ts +71 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
package/README.md
CHANGED
|
@@ -152,9 +152,12 @@ builder
|
|
|
152
152
|
.onError(error => {}) // Handle errors
|
|
153
153
|
.onConnected(() => {}) // Connection established
|
|
154
154
|
.onDisconnected((code) => {}) // Connection closed
|
|
155
|
-
.onMetadata(meta => {}) // Timing information
|
|
155
|
+
.onMetadata(meta => {}) // Timing information + final audio metrics (always-on)
|
|
156
|
+
.onAudioMetrics(m => {}) // Live audio-quality metrics (opt-in, since 0.1.767)
|
|
156
157
|
```
|
|
157
158
|
|
|
159
|
+
> **Audio metrics**: every session delivers a final `audioMetrics` snapshot embedded in `Metadata` (volume, silence ratio, clipping, SNR — all derived from PCM, not from the ASR provider). To also receive live per-chunk updates while audio is flowing, set `asrRequestConfig.audioMetricsIntervalMs > 0` and register `.onAudioMetrics()`. Available in SDK **≥ 0.1.767**. See [audio-metrics-alpha.md](https://github.com/Volley-Inc/recognition-service/blob/dev/docs/design/functional-features/observability/audio-metrics-alpha.md) for the full schema, and the repo-root [CHANGELOG.md](https://github.com/Volley-Inc/recognition-service/blob/dev/CHANGELOG.md) for SDK release history.
|
|
160
|
+
|
|
158
161
|
### Optional Parameters
|
|
159
162
|
|
|
160
163
|
```typescript
|
|
@@ -24,6 +24,7 @@ declare enum RecognitionProvider {
|
|
|
24
24
|
BEDROCK = "bedrock",
|
|
25
25
|
INWORLD_STT = "inworld-stt",
|
|
26
26
|
AWS_TRANSCRIBE = "aws-transcribe",
|
|
27
|
+
AMAZON_NOVA_SONIC = "amazon-nova-sonic",
|
|
27
28
|
TEST_ASR_PROVIDER_QUOTA = "test-asr-provider-quota",
|
|
28
29
|
TEST_ASR_STREAMING = "test-asr-streaming"
|
|
29
30
|
}
|
|
@@ -96,6 +97,7 @@ declare enum ElevenLabsModel {
|
|
|
96
97
|
* @see https://platform.openai.com/docs/models/gpt-4o-transcribe
|
|
97
98
|
*/
|
|
98
99
|
declare enum OpenAIRealtimeModel {
|
|
100
|
+
GPT_REALTIME_WHISPER = "gpt-realtime-whisper",
|
|
99
101
|
GPT_4O_TRANSCRIBE = "gpt-4o-transcribe",
|
|
100
102
|
GPT_4O_MINI_TRANSCRIBE = "gpt-4o-mini-transcribe"
|
|
101
103
|
}
|
|
@@ -146,135 +148,28 @@ declare enum InworldSttModel {
|
|
|
146
148
|
declare enum AwsTranscribeModel {
|
|
147
149
|
DEFAULT = "default"
|
|
148
150
|
}
|
|
151
|
+
/**
|
|
152
|
+
* Amazon Nova Sonic bidirectional streaming model (Bedrock).
|
|
153
|
+
* Speech-to-speech model; we consume the USER FINAL transcript and discard the assistant text/audio output.
|
|
154
|
+
* @see https://docs.aws.amazon.com/nova/latest/userguide/speech-bidirection.html
|
|
155
|
+
*/
|
|
156
|
+
declare enum AmazonNovaSonicModel {
|
|
157
|
+
AMAZON_NOVA_SONIC_V1 = "amazon.nova-sonic-v1:0",
|
|
158
|
+
AMAZON_NOVA_2_SONIC = "amazon.nova-2-sonic-v1:0"
|
|
159
|
+
}
|
|
149
160
|
/**
|
|
150
161
|
* Self-serve vLLM batch transcription models
|
|
151
|
-
* Backed by recognition-inference / RunPod `/transcribe`
|
|
162
|
+
* Backed by recognition-inference / RunPod `/ws/transcribe`
|
|
152
163
|
*/
|
|
153
164
|
declare enum SelfServeVllmModel {
|
|
165
|
+
QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
|
|
166
|
+
QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
|
|
154
167
|
QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
|
|
155
168
|
}
|
|
156
169
|
/**
|
|
157
170
|
* Type alias for any model from any provider
|
|
158
171
|
*/
|
|
159
|
-
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | GladiaModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | CartesiaModel | DashScopeModel | InworldSttModel | SelfServeVllmModel | BedrockModel | AwsTranscribeModel | string;
|
|
160
|
-
|
|
161
|
-
/**
|
|
162
|
-
* Audio encoding types
|
|
163
|
-
*/
|
|
164
|
-
declare enum AudioEncoding {
|
|
165
|
-
ENCODING_UNSPECIFIED = 0,
|
|
166
|
-
LINEAR16 = 1,
|
|
167
|
-
OGG_OPUS = 2,
|
|
168
|
-
FLAC = 3,
|
|
169
|
-
MULAW = 4,
|
|
170
|
-
ALAW = 5
|
|
171
|
-
}
|
|
172
|
-
declare namespace AudioEncoding {
|
|
173
|
-
/**
|
|
174
|
-
* Convert numeric ID to AudioEncoding enum
|
|
175
|
-
* @param id - Numeric encoding identifier (0-5)
|
|
176
|
-
* @returns AudioEncoding enum value or undefined if invalid
|
|
177
|
-
*/
|
|
178
|
-
function fromId(id: number): AudioEncoding | undefined;
|
|
179
|
-
/**
|
|
180
|
-
* Convert string name to AudioEncoding enum
|
|
181
|
-
* @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
|
|
182
|
-
* @returns AudioEncoding enum value or undefined if invalid
|
|
183
|
-
*/
|
|
184
|
-
function fromName(nameStr: string): AudioEncoding | undefined;
|
|
185
|
-
/**
|
|
186
|
-
* Convert AudioEncoding enum to numeric ID
|
|
187
|
-
* @param encoding - AudioEncoding enum value
|
|
188
|
-
* @returns Numeric ID (0-5)
|
|
189
|
-
*/
|
|
190
|
-
function toId(encoding: AudioEncoding): number;
|
|
191
|
-
/**
|
|
192
|
-
* Convert AudioEncoding enum to string name
|
|
193
|
-
* @param encoding - AudioEncoding enum value
|
|
194
|
-
* @returns String name like "LINEAR16", "MULAW", etc.
|
|
195
|
-
*/
|
|
196
|
-
function toName(encoding: AudioEncoding): string;
|
|
197
|
-
/**
|
|
198
|
-
* Check if a numeric ID is a valid encoding
|
|
199
|
-
* @param id - Numeric identifier to validate
|
|
200
|
-
* @returns true if valid encoding ID
|
|
201
|
-
*/
|
|
202
|
-
function isIdValid(id: number): boolean;
|
|
203
|
-
/**
|
|
204
|
-
* Check if a string name is a valid encoding
|
|
205
|
-
* @param nameStr - String name to validate
|
|
206
|
-
* @returns true if valid encoding name
|
|
207
|
-
*/
|
|
208
|
-
function isNameValid(nameStr: string): boolean;
|
|
209
|
-
}
|
|
210
|
-
/**
|
|
211
|
-
* Common sample rates (in Hz)
|
|
212
|
-
*/
|
|
213
|
-
declare enum SampleRate {
|
|
214
|
-
RATE_8000 = 8000,
|
|
215
|
-
RATE_16000 = 16000,
|
|
216
|
-
RATE_22050 = 22050,
|
|
217
|
-
RATE_24000 = 24000,
|
|
218
|
-
RATE_32000 = 32000,
|
|
219
|
-
RATE_44100 = 44100,
|
|
220
|
-
RATE_48000 = 48000
|
|
221
|
-
}
|
|
222
|
-
declare namespace SampleRate {
|
|
223
|
-
/**
|
|
224
|
-
* Convert Hz value to SampleRate enum
|
|
225
|
-
* @param hz - Sample rate in Hz (8000, 16000, etc.)
|
|
226
|
-
* @returns SampleRate enum value or undefined if invalid
|
|
227
|
-
*/
|
|
228
|
-
function fromHz(hz: number): SampleRate | undefined;
|
|
229
|
-
/**
|
|
230
|
-
* Convert string name to SampleRate enum
|
|
231
|
-
* @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
|
|
232
|
-
* @returns SampleRate enum value or undefined if invalid
|
|
233
|
-
*/
|
|
234
|
-
function fromName(nameStr: string): SampleRate | undefined;
|
|
235
|
-
/**
|
|
236
|
-
* Convert SampleRate enum to Hz value
|
|
237
|
-
* @param rate - SampleRate enum value
|
|
238
|
-
* @returns Hz value (8000, 16000, etc.)
|
|
239
|
-
*/
|
|
240
|
-
function toHz(rate: SampleRate): number;
|
|
241
|
-
/**
|
|
242
|
-
* Convert SampleRate enum to string name
|
|
243
|
-
* @param rate - SampleRate enum value
|
|
244
|
-
* @returns String name like "RATE_8000", "RATE_16000", etc.
|
|
245
|
-
*/
|
|
246
|
-
function toName(rate: SampleRate): string;
|
|
247
|
-
/**
|
|
248
|
-
* Check if a numeric Hz value is a valid sample rate
|
|
249
|
-
* @param hz - Hz value to validate
|
|
250
|
-
* @returns true if valid sample rate
|
|
251
|
-
*/
|
|
252
|
-
function isHzValid(hz: number): boolean;
|
|
253
|
-
/**
|
|
254
|
-
* Check if a string name is a valid sample rate
|
|
255
|
-
* @param nameStr - String name to validate
|
|
256
|
-
* @returns true if valid sample rate name
|
|
257
|
-
*/
|
|
258
|
-
function isNameValid(nameStr: string): boolean;
|
|
259
|
-
}
|
|
260
|
-
/**
|
|
261
|
-
* Supported languages for recognition
|
|
262
|
-
* Using BCP-47 language tags
|
|
263
|
-
*/
|
|
264
|
-
declare enum Language {
|
|
265
|
-
ENGLISH_US = "en-US",
|
|
266
|
-
ENGLISH_GB = "en-GB",
|
|
267
|
-
SPANISH_ES = "es-ES",
|
|
268
|
-
SPANISH_MX = "es-MX",
|
|
269
|
-
FRENCH_FR = "fr-FR",
|
|
270
|
-
GERMAN_DE = "de-DE",
|
|
271
|
-
ITALIAN_IT = "it-IT",
|
|
272
|
-
PORTUGUESE_BR = "pt-BR",
|
|
273
|
-
JAPANESE_JP = "ja-JP",
|
|
274
|
-
KOREAN_KR = "ko-KR",
|
|
275
|
-
CHINESE_CN = "zh-CN",
|
|
276
|
-
CHINESE_TW = "zh-TW"
|
|
277
|
-
}
|
|
172
|
+
type RecognitionModel = DeepgramModel | GoogleModel | FireworksModel | GladiaModel | ElevenLabsModel | OpenAIRealtimeModel | MistralVoxtralModel | CartesiaModel | DashScopeModel | InworldSttModel | SelfServeVllmModel | BedrockModel | AwsTranscribeModel | AmazonNovaSonicModel | string;
|
|
278
173
|
|
|
279
174
|
/**
|
|
280
175
|
* Recognition Result Types V1
|
|
@@ -294,6 +189,16 @@ declare enum RecognitionResultTypeV1 {
|
|
|
294
189
|
AUDIO_METRICS = "AudioMetrics",
|
|
295
190
|
SESSION_CONFIGURED = "SessionConfigured"
|
|
296
191
|
}
|
|
192
|
+
/**
|
|
193
|
+
* Source of a phrase detection — what kind of provider feature produced
|
|
194
|
+
* the hit. Currently only Deepgram's `search` parameter is wired up, so
|
|
195
|
+
* this enum has one value. New entries (e.g. KEYWORDS, KEYTERMS,
|
|
196
|
+
* SPEECH_CONTEXTS) get added when other providers join.
|
|
197
|
+
*/
|
|
198
|
+
declare enum DetectionTypeV1 {
|
|
199
|
+
/** Deepgram phonetic phrase match via the `search=…` request parameter */
|
|
200
|
+
SEARCH = "search"
|
|
201
|
+
}
|
|
297
202
|
/**
|
|
298
203
|
* Transcription result V1 - contains transcript message
|
|
299
204
|
* In the long run the game side should not need to know about it. In the short run it is sent back to the client.
|
|
@@ -318,6 +223,25 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
318
223
|
receivedAtMs: z.ZodOptional<z.ZodNumber>;
|
|
319
224
|
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
320
225
|
rawAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
226
|
+
detections: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
227
|
+
type: z.ZodNativeEnum<typeof DetectionTypeV1>;
|
|
228
|
+
query: z.ZodString;
|
|
229
|
+
score: z.ZodNumber;
|
|
230
|
+
startMs: z.ZodOptional<z.ZodNumber>;
|
|
231
|
+
endMs: z.ZodOptional<z.ZodNumber>;
|
|
232
|
+
}, "strip", z.ZodTypeAny, {
|
|
233
|
+
type: DetectionTypeV1;
|
|
234
|
+
query: string;
|
|
235
|
+
score: number;
|
|
236
|
+
startMs?: number | undefined;
|
|
237
|
+
endMs?: number | undefined;
|
|
238
|
+
}, {
|
|
239
|
+
type: DetectionTypeV1;
|
|
240
|
+
query: string;
|
|
241
|
+
score: number;
|
|
242
|
+
startMs?: number | undefined;
|
|
243
|
+
endMs?: number | undefined;
|
|
244
|
+
}>, "many">>;
|
|
321
245
|
}, "strip", z.ZodTypeAny, {
|
|
322
246
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
323
247
|
audioUtteranceId: string;
|
|
@@ -337,6 +261,13 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
337
261
|
receivedAtMs?: number | undefined;
|
|
338
262
|
accumulatedAudioTimeMs?: number | undefined;
|
|
339
263
|
rawAudioTimeMs?: number | undefined;
|
|
264
|
+
detections?: {
|
|
265
|
+
type: DetectionTypeV1;
|
|
266
|
+
query: string;
|
|
267
|
+
score: number;
|
|
268
|
+
startMs?: number | undefined;
|
|
269
|
+
endMs?: number | undefined;
|
|
270
|
+
}[] | undefined;
|
|
340
271
|
}, {
|
|
341
272
|
type: RecognitionResultTypeV1.TRANSCRIPTION;
|
|
342
273
|
audioUtteranceId: string;
|
|
@@ -356,6 +287,13 @@ declare const TranscriptionResultSchemaV1: z.ZodObject<{
|
|
|
356
287
|
receivedAtMs?: number | undefined;
|
|
357
288
|
accumulatedAudioTimeMs?: number | undefined;
|
|
358
289
|
rawAudioTimeMs?: number | undefined;
|
|
290
|
+
detections?: {
|
|
291
|
+
type: DetectionTypeV1;
|
|
292
|
+
query: string;
|
|
293
|
+
score: number;
|
|
294
|
+
startMs?: number | undefined;
|
|
295
|
+
endMs?: number | undefined;
|
|
296
|
+
}[] | undefined;
|
|
359
297
|
}>;
|
|
360
298
|
type TranscriptionResultV1 = z.infer<typeof TranscriptionResultSchemaV1>;
|
|
361
299
|
/**
|
|
@@ -667,6 +605,138 @@ declare const AudioMetricsResultSchemaV1: z.ZodObject<{
|
|
|
667
605
|
}>;
|
|
668
606
|
type AudioMetricsResultV1 = z.infer<typeof AudioMetricsResultSchemaV1>;
|
|
669
607
|
|
|
608
|
+
/**
|
|
609
|
+
* Audio encoding types
|
|
610
|
+
*/
|
|
611
|
+
declare enum AudioEncoding {
|
|
612
|
+
ENCODING_UNSPECIFIED = 0,
|
|
613
|
+
LINEAR16 = 1,
|
|
614
|
+
OGG_OPUS = 2,
|
|
615
|
+
FLAC = 3,
|
|
616
|
+
MULAW = 4,
|
|
617
|
+
ALAW = 5
|
|
618
|
+
}
|
|
619
|
+
declare namespace AudioEncoding {
|
|
620
|
+
/**
|
|
621
|
+
* Convert numeric ID to AudioEncoding enum
|
|
622
|
+
* @param id - Numeric encoding identifier (0-5)
|
|
623
|
+
* @returns AudioEncoding enum value or undefined if invalid
|
|
624
|
+
*/
|
|
625
|
+
function fromId(id: number): AudioEncoding | undefined;
|
|
626
|
+
/**
|
|
627
|
+
* Convert string name to AudioEncoding enum
|
|
628
|
+
* @param nameStr - String name like "linear16", "LINEAR16", "ogg_opus", "OGG_OPUS", etc. (case insensitive)
|
|
629
|
+
* @returns AudioEncoding enum value or undefined if invalid
|
|
630
|
+
*/
|
|
631
|
+
function fromName(nameStr: string): AudioEncoding | undefined;
|
|
632
|
+
/**
|
|
633
|
+
* Convert AudioEncoding enum to numeric ID
|
|
634
|
+
* @param encoding - AudioEncoding enum value
|
|
635
|
+
* @returns Numeric ID (0-5)
|
|
636
|
+
*/
|
|
637
|
+
function toId(encoding: AudioEncoding): number;
|
|
638
|
+
/**
|
|
639
|
+
* Convert AudioEncoding enum to string name
|
|
640
|
+
* @param encoding - AudioEncoding enum value
|
|
641
|
+
* @returns String name like "LINEAR16", "MULAW", etc.
|
|
642
|
+
*/
|
|
643
|
+
function toName(encoding: AudioEncoding): string;
|
|
644
|
+
/**
|
|
645
|
+
* Check if a numeric ID is a valid encoding
|
|
646
|
+
* @param id - Numeric identifier to validate
|
|
647
|
+
* @returns true if valid encoding ID
|
|
648
|
+
*/
|
|
649
|
+
function isIdValid(id: number): boolean;
|
|
650
|
+
/**
|
|
651
|
+
* Check if a string name is a valid encoding
|
|
652
|
+
* @param nameStr - String name to validate
|
|
653
|
+
* @returns true if valid encoding name
|
|
654
|
+
*/
|
|
655
|
+
function isNameValid(nameStr: string): boolean;
|
|
656
|
+
/**
|
|
657
|
+
* Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
|
|
658
|
+
*
|
|
659
|
+
* - enum / number → returned as-is (already AudioEncoding-shaped)
|
|
660
|
+
* - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
|
|
661
|
+
* Invokes `onStringInput` with a warning message so callers can route it
|
|
662
|
+
* to their preferred logger.
|
|
663
|
+
* - invalid string → throws (preferred over silent fallback so typos surface)
|
|
664
|
+
* - undefined → defaults to {@link AudioEncoding.LINEAR16}
|
|
665
|
+
*
|
|
666
|
+
* Always normalize at the SDK / server boundary so downstream code can rely
|
|
667
|
+
* on a numeric AudioEncoding (the wire-level binary frame header is uint32).
|
|
668
|
+
*/
|
|
669
|
+
function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
|
|
670
|
+
}
|
|
671
|
+
/**
|
|
672
|
+
* Common sample rates (in Hz)
|
|
673
|
+
*/
|
|
674
|
+
declare enum SampleRate {
|
|
675
|
+
RATE_8000 = 8000,
|
|
676
|
+
RATE_16000 = 16000,
|
|
677
|
+
RATE_22050 = 22050,
|
|
678
|
+
RATE_24000 = 24000,
|
|
679
|
+
RATE_32000 = 32000,
|
|
680
|
+
RATE_44100 = 44100,
|
|
681
|
+
RATE_48000 = 48000
|
|
682
|
+
}
|
|
683
|
+
declare namespace SampleRate {
|
|
684
|
+
/**
|
|
685
|
+
* Convert Hz value to SampleRate enum
|
|
686
|
+
* @param hz - Sample rate in Hz (8000, 16000, etc.)
|
|
687
|
+
* @returns SampleRate enum value or undefined if invalid
|
|
688
|
+
*/
|
|
689
|
+
function fromHz(hz: number): SampleRate | undefined;
|
|
690
|
+
/**
|
|
691
|
+
* Convert string name to SampleRate enum
|
|
692
|
+
* @param nameStr - String name like "rate_8000", "RATE_16000", etc. (case insensitive)
|
|
693
|
+
* @returns SampleRate enum value or undefined if invalid
|
|
694
|
+
*/
|
|
695
|
+
function fromName(nameStr: string): SampleRate | undefined;
|
|
696
|
+
/**
|
|
697
|
+
* Convert SampleRate enum to Hz value
|
|
698
|
+
* @param rate - SampleRate enum value
|
|
699
|
+
* @returns Hz value (8000, 16000, etc.)
|
|
700
|
+
*/
|
|
701
|
+
function toHz(rate: SampleRate): number;
|
|
702
|
+
/**
|
|
703
|
+
* Convert SampleRate enum to string name
|
|
704
|
+
* @param rate - SampleRate enum value
|
|
705
|
+
* @returns String name like "RATE_8000", "RATE_16000", etc.
|
|
706
|
+
*/
|
|
707
|
+
function toName(rate: SampleRate): string;
|
|
708
|
+
/**
|
|
709
|
+
* Check if a numeric Hz value is a valid sample rate
|
|
710
|
+
* @param hz - Hz value to validate
|
|
711
|
+
* @returns true if valid sample rate
|
|
712
|
+
*/
|
|
713
|
+
function isHzValid(hz: number): boolean;
|
|
714
|
+
/**
|
|
715
|
+
* Check if a string name is a valid sample rate
|
|
716
|
+
* @param nameStr - String name to validate
|
|
717
|
+
* @returns true if valid sample rate name
|
|
718
|
+
*/
|
|
719
|
+
function isNameValid(nameStr: string): boolean;
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Supported languages for recognition
|
|
723
|
+
* Using BCP-47 language tags
|
|
724
|
+
*/
|
|
725
|
+
declare enum Language {
|
|
726
|
+
ENGLISH_US = "en-US",
|
|
727
|
+
ENGLISH_GB = "en-GB",
|
|
728
|
+
SPANISH_ES = "es-ES",
|
|
729
|
+
SPANISH_MX = "es-MX",
|
|
730
|
+
FRENCH_FR = "fr-FR",
|
|
731
|
+
GERMAN_DE = "de-DE",
|
|
732
|
+
ITALIAN_IT = "it-IT",
|
|
733
|
+
PORTUGUESE_BR = "pt-BR",
|
|
734
|
+
JAPANESE_JP = "ja-JP",
|
|
735
|
+
KOREAN_KR = "ko-KR",
|
|
736
|
+
CHINESE_CN = "zh-CN",
|
|
737
|
+
CHINESE_TW = "zh-TW"
|
|
738
|
+
}
|
|
739
|
+
|
|
670
740
|
/**
|
|
671
741
|
* Recognition Context Types V1
|
|
672
742
|
* NOTE_TO_AI: DO NOT CHANGE THIS UNLESS EXPLICITLY ASKED. Always ask before making any changes.
|
|
@@ -852,10 +922,10 @@ interface ASRRequestConfig {
|
|
|
852
922
|
* doesn't respond with is_final=true after stopRecording().
|
|
853
923
|
*
|
|
854
924
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
855
|
-
* - balanced: 500ms -
|
|
856
|
-
* - conservative: 1000ms - wait longer for complex utterances
|
|
925
|
+
* - balanced: 500ms - good for most cases
|
|
926
|
+
* - conservative: 1000ms - current default, wait longer for complex utterances
|
|
857
927
|
*
|
|
858
|
-
* @default '
|
|
928
|
+
* @default 'conservative'
|
|
859
929
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
860
930
|
*/
|
|
861
931
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
@@ -949,6 +1019,29 @@ interface ASRRequestConfig {
|
|
|
949
1019
|
* @example 500
|
|
950
1020
|
*/
|
|
951
1021
|
audioMetricsIntervalMs?: number;
|
|
1022
|
+
/**
|
|
1023
|
+
* Opt-in: round-trip Deepgram `search` phrase hits into the transcript.
|
|
1024
|
+
*
|
|
1025
|
+
* When `true` AND the resolved provider/model is **deepgram nova-2** AND the
|
|
1026
|
+
* GameContext `gamePhase` is `'Solve Puzzle'`, every Deepgram Results event
|
|
1027
|
+
* with a `channel.search` hit at confidence ≥ 0.6 has the original query
|
|
1028
|
+
* prepended to the transcript text delivered to the client. This restores
|
|
1029
|
+
* parity with the legacy Roku→Deepgram WoF Puzzle-Solve path where the
|
|
1030
|
+
* phrase round-trip lets downstream NLU match multi-word puzzle solutions
|
|
1031
|
+
* even when nova-2's primary transcription drifts.
|
|
1032
|
+
*
|
|
1033
|
+
* Default: `false` (no prepend; transcript is whatever nova-2 produces).
|
|
1034
|
+
*
|
|
1035
|
+
* Scope guard rationale:
|
|
1036
|
+
* - nova-2 only: nova-3 / flux do not need this (they handle phrase
|
|
1037
|
+
* spotting differently and the prepend would only add noise).
|
|
1038
|
+
* - Solve-Puzzle scene only: other WoF scenes (Letter-Guess,
|
|
1039
|
+
* Bonus-Round, etc.) do NOT want the slotMap phrase prepended — only
|
|
1040
|
+
* Puzzle-Solve depends on the phrase round-trip.
|
|
1041
|
+
*
|
|
1042
|
+
* @default false
|
|
1043
|
+
*/
|
|
1044
|
+
appendSearch?: boolean;
|
|
952
1045
|
/**
|
|
953
1046
|
* Optional fallback ASR configurations
|
|
954
1047
|
*
|
|
@@ -1327,6 +1420,23 @@ interface IRecognitionClient {
|
|
|
1327
1420
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
1328
1421
|
*/
|
|
1329
1422
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1423
|
+
/**
|
|
1424
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
1425
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
1426
|
+
* server validator) before transmitting.
|
|
1427
|
+
*
|
|
1428
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1429
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
1430
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
1431
|
+
* skip the resample step.
|
|
1432
|
+
*
|
|
1433
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1434
|
+
* mixed to mono by the caller.
|
|
1435
|
+
*
|
|
1436
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1437
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1438
|
+
*/
|
|
1439
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1330
1440
|
/**
|
|
1331
1441
|
* Stop recording and wait for final transcript
|
|
1332
1442
|
* The server will close the connection after sending the final transcript.
|
|
@@ -1526,6 +1636,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1526
1636
|
*/
|
|
1527
1637
|
private connectWithRetry;
|
|
1528
1638
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1639
|
+
/**
|
|
1640
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
1641
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
1642
|
+
* before sending.
|
|
1643
|
+
*
|
|
1644
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1645
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
1646
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
1647
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
1648
|
+
* `sendAudio()` to skip the resample step.
|
|
1649
|
+
*
|
|
1650
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
1651
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
1652
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
1653
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
1654
|
+
*
|
|
1655
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1656
|
+
* mixed to mono by the caller.
|
|
1657
|
+
*
|
|
1658
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1659
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1660
|
+
*/
|
|
1661
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1529
1662
|
private sendAudioInternal;
|
|
1530
1663
|
/**
|
|
1531
1664
|
* Only active when the client is in READY state; otherwise it will return immediately.
|