@volley/recognition-client-sdk 0.1.767 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/dist/browser.bundled.d.ts +256 -123
- package/dist/index.bundled.d.ts +279 -125
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +156 -16
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +135 -7
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +1 -1
- package/src/index.spec.ts +2 -0
- package/src/index.ts +1 -0
- package/src/recognition-client.ts +71 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
|
@@ -3749,6 +3749,7 @@ var RecognitionProvider;
|
|
|
3749
3749
|
RecognitionProvider2["BEDROCK"] = "bedrock";
|
|
3750
3750
|
RecognitionProvider2["INWORLD_STT"] = "inworld-stt";
|
|
3751
3751
|
RecognitionProvider2["AWS_TRANSCRIBE"] = "aws-transcribe";
|
|
3752
|
+
RecognitionProvider2["AMAZON_NOVA_SONIC"] = "amazon-nova-sonic";
|
|
3752
3753
|
RecognitionProvider2["TEST_ASR_PROVIDER_QUOTA"] = "test-asr-provider-quota";
|
|
3753
3754
|
RecognitionProvider2["TEST_ASR_STREAMING"] = "test-asr-streaming";
|
|
3754
3755
|
})(RecognitionProvider || (RecognitionProvider = {}));
|
|
@@ -3799,6 +3800,7 @@ var ElevenLabsModel;
|
|
|
3799
3800
|
})(ElevenLabsModel || (ElevenLabsModel = {}));
|
|
3800
3801
|
// Enum-style lookup object: maps each OpenAIRealtimeModel member name to its
// string identifier. The `X || (X = {})` form keeps any members that an
// earlier declaration of the same variable may already have attached.
var OpenAIRealtimeModel;
(function (members) {
  Object.assign(members, {
    GPT_REALTIME_WHISPER: "gpt-realtime-whisper",
    GPT_4O_TRANSCRIBE: "gpt-4o-transcribe",
    GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe",
  });
})(OpenAIRealtimeModel || (OpenAIRealtimeModel = {}));
|
|
@@ -3829,8 +3831,15 @@ var AwsTranscribeModel;
|
|
|
3829
3831
|
(function(AwsTranscribeModel2) {
|
|
3830
3832
|
AwsTranscribeModel2["DEFAULT"] = "default";
|
|
3831
3833
|
})(AwsTranscribeModel || (AwsTranscribeModel = {}));
|
|
3834
|
+
// Enum-style lookup object: maps each AmazonNovaSonicModel member name to its
// string model identifier.
var AmazonNovaSonicModel;
(function (members) {
  Object.assign(members, {
    AMAZON_NOVA_SONIC_V1: "amazon.nova-sonic-v1:0",
    AMAZON_NOVA_2_SONIC: "amazon.nova-2-sonic-v1:0",
  });
})(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
|
|
3832
3839
|
// Enum-style lookup object: maps each SelfServeVllmModel member name to its
// string model identifier.
var SelfServeVllmModel;
(function (members) {
  Object.assign(members, {
    QWEN3_ASR_0_6B: "qwen3-asr-0.6b",
    QWEN3_ASR_0_6B_WOF_LETTER: "qwen3-asr-0.6b-wof-letter",
    QWEN3_ASR_1_7B: "qwen3-asr-1.7b",
  });
})(SelfServeVllmModel || (SelfServeVllmModel = {}));
|
|
3836
3845
|
|
|
@@ -3845,6 +3854,18 @@ var RecognitionResultTypeV1;
|
|
|
3845
3854
|
RecognitionResultTypeV12["AUDIO_METRICS"] = "AudioMetrics";
|
|
3846
3855
|
RecognitionResultTypeV12["SESSION_CONFIGURED"] = "SessionConfigured";
|
|
3847
3856
|
})(RecognitionResultTypeV1 || (RecognitionResultTypeV1 = {}));
|
|
3857
|
+
// Enum-style lookup object for detection kinds; currently a single member,
// SEARCH, whose string value is "search".
var DetectionTypeV1;
(function (members) {
  Object.assign(members, { SEARCH: "search" });
})(DetectionTypeV1 || (DetectionTypeV1 = {}));
|
|
3861
|
+
// Zod schema for a single detection entry: a typed hit with its query string,
// a score, and optional start/end positions.
var DetectionV1Schema = z.object({
  // Must be one of the DetectionTypeV1 values.
  type: z.nativeEnum(DetectionTypeV1),
  query: z.string(),
  // Score is restricted to the closed range [0, 1].
  score: z.number().min(0).max(1),
  // NOTE(review): presumably the audio time (ms from stream start) where the
  // hit begins, mirroring endMs — confirm against the original TS source.
  startMs: z.number().optional(),
  // Audio time (ms from stream start) where the hit ends
  endMs: z.number().optional(),
});
|
|
3848
3869
|
var TranscriptionResultSchemaV1 = z.object({
|
|
3849
3870
|
type: z.literal(RecognitionResultTypeV1.TRANSCRIPTION),
|
|
3850
3871
|
audioUtteranceId: z.string(),
|
|
@@ -3863,8 +3884,9 @@ var TranscriptionResultSchemaV1 = z.object({
|
|
|
3863
3884
|
endTimestamp: z.number().optional(),
|
|
3864
3885
|
receivedAtMs: z.number().optional(),
|
|
3865
3886
|
accumulatedAudioTimeMs: z.number().optional(),
|
|
3866
|
-
rawAudioTimeMs: z.number().optional()
|
|
3867
|
-
|
|
3887
|
+
rawAudioTimeMs: z.number().optional(),
|
|
3888
|
+
detections: z.array(DetectionV1Schema).optional()
|
|
3889
|
+
// Provider-reported phrase detections (query + score, optionally startMs/endMs). Always populated when the provider returns hits, regardless of `appendSearch`. Other providers leave this undefined.
|
|
3868
3890
|
});
|
|
3869
3891
|
var FunctionCallResultSchemaV1 = z.object({
|
|
3870
3892
|
type: z.literal(RecognitionResultTypeV1.FUNCTION_CALL),
|
|
@@ -4117,7 +4139,15 @@ var TranscriptMessageSchema = z.object({
|
|
|
4117
4139
|
* @example true
|
|
4118
4140
|
* @default false
|
|
4119
4141
|
*/
|
|
4120
|
-
is_fallback: z.boolean().optional()
|
|
4142
|
+
is_fallback: z.boolean().optional(),
|
|
4143
|
+
/**
|
|
4144
|
+
* Provider-reported phrase detections (query + score, optionally
|
|
4145
|
+
* startMs/endMs). Always populated when the provider returns hits,
|
|
4146
|
+
* regardless of `appendSearch` or scene gating. Other providers leave
|
|
4147
|
+
* this undefined.
|
|
4148
|
+
* @example [{ query: 'justin bieber one time', score: 0.78, startMs: 1200, endMs: 2800 }]
|
|
4149
|
+
*/
|
|
4150
|
+
detections: z.array(DetectionV1Schema).optional()
|
|
4121
4151
|
});
|
|
4122
4152
|
var VADEndSignalSchema = z.object({
|
|
4123
4153
|
type: z.literal(ProviderMessageType.VAD_END_SIGNAL),
|
|
@@ -4434,6 +4464,9 @@ var ASRRequestSchemaV1 = z.object({
|
|
|
4434
4464
|
// Streaming audio metrics opt-in: when > 0, server emits AudioMetrics results throttled to this interval (ms).
|
|
4435
4465
|
// Undefined / 0 disables streaming audio metrics (final metrics still embedded in Metadata).
|
|
4436
4466
|
audioMetricsIntervalMs: z.number().optional(),
|
|
4467
|
+
// Opt-in: round-trip Deepgram `search` phrase hits into the transcript.
|
|
4468
|
+
// Active only when (model = deepgram nova-2) AND (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch in asr-config.types.ts for full semantics.
|
|
4469
|
+
appendSearch: z.boolean().optional(),
|
|
4437
4470
|
// Debug options (FOR DEBUG/TESTING ONLY - not for production use)
|
|
4438
4471
|
debugCommand: RequestDebugCommandSchema
|
|
4439
4472
|
});
|
|
@@ -4588,6 +4621,21 @@ var AudioEncoding;
|
|
|
4588
4621
|
return NAME_TO_ENUM.has(nameStr.toUpperCase());
|
|
4589
4622
|
}
|
|
4590
4623
|
AudioEncoding2.isNameValid = isNameValid;
|
|
4624
|
+
/**
 * Normalize an encoding supplied as an enum number, a name string, or nothing.
 *
 * - `undefined` resolves to `AudioEncoding2.LINEAR16`.
 * - A number is returned unchanged (no range check is performed here).
 * - Any other value is resolved through `fromName`; if that yields
 *   `undefined` an Error is thrown, otherwise `onStringInput` (when given) is
 *   called with a hint recommending the enum member before returning.
 *
 * @param value - Encoding as a number, a name, or undefined.
 * @param onStringInput - Optional callback receiving the hint message.
 * @returns The resolved encoding value.
 * @throws {Error} When `fromName` cannot resolve the given value.
 */
function coerce2(value, onStringInput) {
  if (typeof value === "number") {
    return value;
  }
  if (value === void 0) {
    return AudioEncoding2.LINEAR16;
  }
  const resolved = fromName(value);
  if (resolved !== void 0) {
    onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(resolved)} enum for type safety`);
    return resolved;
  }
  throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
}
|
|
4638
|
+
AudioEncoding2.coerce = coerce2;
|
|
4591
4639
|
})(AudioEncoding || (AudioEncoding = {}));
|
|
4592
4640
|
var PREFIX_AUDIO_ENCODING_OFFSET = 128;
|
|
4593
4641
|
var SampleRate;
|
|
@@ -5313,6 +5361,37 @@ var MessageHandler = class {
|
|
|
5313
5361
|
}
|
|
5314
5362
|
};
|
|
5315
5363
|
|
|
5364
|
+
// src/utils/audio-resampler.ts
|
|
5365
|
+
/**
 * Downsample 16-bit PCM samples by box-filter averaging.
 *
 * Each output sample is the rounded mean of the source samples whose indices
 * fall in [floor(i * ratio), ceil((i + 1) * ratio)), where
 * ratio = srcRate / targetRate. The input is never modified; the result is
 * always a freshly allocated buffer.
 *
 * @param input - ArrayBuffer, or any ArrayBuffer view (its byteOffset and
 *   byteLength are honoured), holding 16-bit samples in platform byte order.
 * @param srcRate - Sample rate of `input` in Hz; must be a positive finite number.
 * @param targetRate - Desired sample rate in Hz; must be a positive finite
 *   number no greater than `srcRate`.
 * @returns A new buffer with floor(sampleCount / ratio) 16-bit samples, or a
 *   copy of the input samples when the rates are equal or the input is empty.
 * @throws {RangeError} If a rate is not a positive finite number, or the
 *   input byte length is not a multiple of 2.
 * @throws {Error} If `targetRate` is greater than `srcRate` (upsampling).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Reject NaN / Infinity / zero / negative rates up front: they would
  // otherwise slip past the upsample guard and silently produce an empty or
  // meaningless buffer.
  if (!Number.isFinite(srcRate) || !Number.isFinite(targetRate) || srcRate <= 0 || targetRate <= 0) {
    throw new RangeError(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // A trailing half-sample means the caller's chunking is misaligned; say so
  // explicitly instead of surfacing the Int16Array constructor's generic error.
  if (input.byteLength % 2 !== 0) {
    throw new RangeError(
      `downsamplePcm16: input byte length ${input.byteLength} is not a multiple of 2; PCM16 audio must contain whole 16-bit samples.`
    );
  }
  // Copy out exactly the viewed bytes so an odd byteOffset (illegal for an
  // Int16Array view over the original buffer) is never a problem.
  const buffer = ArrayBuffer.isView(input)
    ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength)
    : input;
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    // With ratio > 1 and i < dstLen the window always holds at least one
    // sample, so the division is safe.
    dst[i] = Math.round(sum / (endIdx - startIdx));
  }
  return dst.buffer;
}
|
|
5394
|
+
|
|
5316
5395
|
// src/errors.ts
|
|
5317
5396
|
var RecognitionError = class extends Error {
|
|
5318
5397
|
constructor(errorType, message) {
|
|
@@ -5387,10 +5466,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5387
5466
|
const retryConfig = config.connectionRetry || {};
|
|
5388
5467
|
const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
|
|
5389
5468
|
const delayMs = retryConfig.delayMs ?? 200;
|
|
5469
|
+
const normalizedASRConfig = config.asrRequestConfig ? {
|
|
5470
|
+
...config.asrRequestConfig,
|
|
5471
|
+
encoding: AudioEncoding.coerce(
|
|
5472
|
+
config.asrRequestConfig.encoding,
|
|
5473
|
+
(warning) => config.logger?.("warn", warning)
|
|
5474
|
+
)
|
|
5475
|
+
} : void 0;
|
|
5390
5476
|
this.config = {
|
|
5391
5477
|
url,
|
|
5392
5478
|
audioUtteranceId,
|
|
5393
|
-
...
|
|
5479
|
+
...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
|
|
5394
5480
|
...config.gameContext && { gameContext: config.gameContext },
|
|
5395
5481
|
...config.callbackUrls && { callbackUrls: config.callbackUrls },
|
|
5396
5482
|
onTranscript: config.onTranscript || (() => {
|
|
@@ -5588,6 +5674,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5588
5674
|
}
|
|
5589
5675
|
this.sendAudioInternal(audioData);
|
|
5590
5676
|
}
|
|
5677
|
+
/**
|
|
5678
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
5679
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
5680
|
+
* before sending.
|
|
5681
|
+
*
|
|
5682
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
5683
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
5684
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
5685
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
5686
|
+
* `sendAudio()` to skip the resample step.
|
|
5687
|
+
*
|
|
5688
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
5689
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
5690
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
5691
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
5692
|
+
*
|
|
5693
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
5694
|
+
* mixed to mono by the caller.
|
|
5695
|
+
*
|
|
5696
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
5697
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
5698
|
+
*/
|
|
5699
|
+
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
5700
|
+
const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5701
|
+
if (audioData instanceof Blob) {
|
|
5702
|
+
blobToArrayBuffer(audioData).then((arrayBuffer) => {
|
|
5703
|
+
this.sendAudioInternal(
|
|
5704
|
+
downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
|
|
5705
|
+
);
|
|
5706
|
+
}).catch((error) => {
|
|
5707
|
+
this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
|
|
5708
|
+
});
|
|
5709
|
+
return;
|
|
5710
|
+
}
|
|
5711
|
+
this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
|
|
5712
|
+
}
|
|
5591
5713
|
sendAudioInternal(audioData) {
|
|
5592
5714
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5593
5715
|
if (bytes === 0) return;
|
|
@@ -5734,7 +5856,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5734
5856
|
model: this.config.asrRequestConfig.model,
|
|
5735
5857
|
language: this.config.asrRequestConfig.language?.toString() || "en",
|
|
5736
5858
|
sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
|
|
5737
|
-
encoding:
|
|
5859
|
+
encoding: this.config.asrRequestConfig.encoding,
|
|
5738
5860
|
interimResults: this.config.asrRequestConfig.interimResults ?? false,
|
|
5739
5861
|
// Auto-enable useContext if gameContext is provided, or use explicit value if set
|
|
5740
5862
|
useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
|
|
@@ -5759,6 +5881,12 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5759
5881
|
// Streaming audio metrics opt-in (ms interval). Server only forwards metrics if > 0.
|
|
5760
5882
|
...this.config.asrRequestConfig.audioMetricsIntervalMs !== void 0 && {
|
|
5761
5883
|
audioMetricsIntervalMs: this.config.asrRequestConfig.audioMetricsIntervalMs
|
|
5884
|
+
},
|
|
5885
|
+
// Opt-in: round-trip Deepgram nova-2 search-phrase hits into the
|
|
5886
|
+
// transcript. Only fires server-side when (model = nova-2) AND
|
|
5887
|
+
// (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch.
|
|
5888
|
+
...this.config.asrRequestConfig.appendSearch !== void 0 && {
|
|
5889
|
+
appendSearch: this.config.asrRequestConfig.appendSearch
|
|
5762
5890
|
}
|
|
5763
5891
|
};
|
|
5764
5892
|
super.sendMessage(
|
|
@@ -5892,7 +6020,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5892
6020
|
*/
|
|
5893
6021
|
sendAudioNow(audioData) {
|
|
5894
6022
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5895
|
-
const encodingId = this.config.asrRequestConfig?.encoding
|
|
6023
|
+
const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
5896
6024
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5897
6025
|
super.sendAudio(
|
|
5898
6026
|
audioData,
|
|
@@ -5960,7 +6088,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5960
6088
|
sendPrefixAudioNow(audioData) {
|
|
5961
6089
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5962
6090
|
if (byteLength === 0) return;
|
|
5963
|
-
const baseEncodingId = this.config.asrRequestConfig?.encoding
|
|
6091
|
+
const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
5964
6092
|
const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
|
|
5965
6093
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5966
6094
|
this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
|