@volley/recognition-client-sdk 0.1.767 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/dist/browser.bundled.d.ts +256 -123
- package/dist/index.bundled.d.ts +279 -125
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +156 -16
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +135 -7
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +1 -1
- package/src/index.spec.ts +2 -0
- package/src/index.ts +1 -0
- package/src/recognition-client.ts +71 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
package/dist/index.js
CHANGED
|
@@ -3749,6 +3749,7 @@ var RecognitionProvider;
|
|
|
3749
3749
|
RecognitionProvider2["BEDROCK"] = "bedrock";
|
|
3750
3750
|
RecognitionProvider2["INWORLD_STT"] = "inworld-stt";
|
|
3751
3751
|
RecognitionProvider2["AWS_TRANSCRIBE"] = "aws-transcribe";
|
|
3752
|
+
RecognitionProvider2["AMAZON_NOVA_SONIC"] = "amazon-nova-sonic";
|
|
3752
3753
|
RecognitionProvider2["TEST_ASR_PROVIDER_QUOTA"] = "test-asr-provider-quota";
|
|
3753
3754
|
RecognitionProvider2["TEST_ASR_STREAMING"] = "test-asr-streaming";
|
|
3754
3755
|
})(RecognitionProvider || (RecognitionProvider = {}));
|
|
@@ -3799,6 +3800,7 @@ var ElevenLabsModel;
|
|
|
3799
3800
|
})(ElevenLabsModel || (ElevenLabsModel = {}));
|
|
3800
3801
|
// Model identifiers accepted for the OpenAI realtime provider.
// Keys are the enum member names; values are the strings sent on the wire.
var OpenAIRealtimeModel;
((models) => {
  Object.assign(models, {
    GPT_REALTIME_WHISPER: "gpt-realtime-whisper",
    GPT_4O_TRANSCRIBE: "gpt-4o-transcribe",
    GPT_4O_MINI_TRANSCRIBE: "gpt-4o-mini-transcribe",
  });
})(OpenAIRealtimeModel || (OpenAIRealtimeModel = {}));
|
|
@@ -3829,8 +3831,15 @@ var AwsTranscribeModel;
|
|
|
3829
3831
|
(function(AwsTranscribeModel2) {
|
|
3830
3832
|
AwsTranscribeModel2["DEFAULT"] = "default";
|
|
3831
3833
|
})(AwsTranscribeModel || (AwsTranscribeModel = {}));
|
|
3834
|
+
// Model identifiers for the Amazon Nova Sonic provider.
// Values are the model id strings, including the ":0" version suffix.
var AmazonNovaSonicModel;
((models) => {
  Object.assign(models, {
    AMAZON_NOVA_SONIC_V1: "amazon.nova-sonic-v1:0",
    AMAZON_NOVA_2_SONIC: "amazon.nova-2-sonic-v1:0",
  });
})(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
|
|
3832
3839
|
// Model identifiers for the self-served vLLM provider.
var SelfServeVllmModel;
((models) => {
  Object.assign(models, {
    QWEN3_ASR_0_6B: "qwen3-asr-0.6b",
    QWEN3_ASR_0_6B_WOF_LETTER: "qwen3-asr-0.6b-wof-letter",
    QWEN3_ASR_1_7B: "qwen3-asr-1.7b",
  });
})(SelfServeVllmModel || (SelfServeVllmModel = {}));
|
|
3836
3845
|
|
|
@@ -3845,6 +3854,18 @@ var RecognitionResultTypeV1;
|
|
|
3845
3854
|
RecognitionResultTypeV12["AUDIO_METRICS"] = "AudioMetrics";
|
|
3846
3855
|
RecognitionResultTypeV12["SESSION_CONFIGURED"] = "SessionConfigured";
|
|
3847
3856
|
})(RecognitionResultTypeV1 || (RecognitionResultTypeV1 = {}));
|
|
3857
|
+
// Kinds of detection a result can carry; "search" is the only member defined here.
var DetectionTypeV1;
((types) => {
  Object.assign(types, { SEARCH: "search" });
})(DetectionTypeV1 || (DetectionTypeV1 = {}));
|
|
3861
|
+
// Schema for a single detection entry attached to a transcription result.
var DetectionV1Schema = z.object({
  // Detection kind; must be one of the DetectionTypeV1 values.
  type: z.nativeEnum(DetectionTypeV1),
  // Query string the detection refers to (presumably the matched search phrase — confirm).
  query: z.string(),
  // Score, constrained to the closed range [0, 1].
  score: z.number().min(0).max(1),
  // Optional; presumably the start counterpart of endMs — confirm against the provider.
  startMs: z.number().optional(),
  // Audio time (ms from stream start) where the hit ends
  endMs: z.number().optional(),
});
|
|
3848
3869
|
var TranscriptionResultSchemaV1 = z.object({
|
|
3849
3870
|
type: z.literal(RecognitionResultTypeV1.TRANSCRIPTION),
|
|
3850
3871
|
audioUtteranceId: z.string(),
|
|
@@ -3863,8 +3884,9 @@ var TranscriptionResultSchemaV1 = z.object({
|
|
|
3863
3884
|
endTimestamp: z.number().optional(),
|
|
3864
3885
|
receivedAtMs: z.number().optional(),
|
|
3865
3886
|
accumulatedAudioTimeMs: z.number().optional(),
|
|
3866
|
-
rawAudioTimeMs: z.number().optional()
|
|
3867
|
-
|
|
3887
|
+
rawAudioTimeMs: z.number().optional(),
|
|
3888
|
+
detections: z.array(DetectionV1Schema).optional()
|
|
3889
|
+
// Provider-reported phrase detections (query + score, optionally startMs/endMs). Always populated when the provider returns hits, regardless of `appendSearch`. Other providers leave this undefined.
|
|
3868
3890
|
});
|
|
3869
3891
|
var FunctionCallResultSchemaV1 = z.object({
|
|
3870
3892
|
type: z.literal(RecognitionResultTypeV1.FUNCTION_CALL),
|
|
@@ -4117,7 +4139,15 @@ var TranscriptMessageSchema = z.object({
|
|
|
4117
4139
|
* @example true
|
|
4118
4140
|
* @default false
|
|
4119
4141
|
*/
|
|
4120
|
-
is_fallback: z.boolean().optional()
|
|
4142
|
+
is_fallback: z.boolean().optional(),
|
|
4143
|
+
/**
|
|
4144
|
+
* Provider-reported phrase detections (query + score, optionally
|
|
4145
|
+
* startMs/endMs). Always populated when the provider returns hits,
|
|
4146
|
+
* regardless of `appendSearch` or scene gating. Other providers leave
|
|
4147
|
+
* this undefined.
|
|
4148
|
+
* @example [{ query: 'justin bieber one time', score: 0.78, startMs: 1200, endMs: 2800 }]
|
|
4149
|
+
*/
|
|
4150
|
+
detections: z.array(DetectionV1Schema).optional()
|
|
4121
4151
|
});
|
|
4122
4152
|
var VADEndSignalSchema = z.object({
|
|
4123
4153
|
type: z.literal(ProviderMessageType.VAD_END_SIGNAL),
|
|
@@ -4457,6 +4487,9 @@ var ASRRequestSchemaV1 = z.object({
|
|
|
4457
4487
|
// Streaming audio metrics opt-in: when > 0, server emits AudioMetrics results throttled to this interval (ms).
|
|
4458
4488
|
// Undefined / 0 disables streaming audio metrics (final metrics still embedded in Metadata).
|
|
4459
4489
|
audioMetricsIntervalMs: z.number().optional(),
|
|
4490
|
+
// Opt-in: round-trip Deepgram `search` phrase hits into the transcript.
|
|
4491
|
+
// Active only when (model = deepgram nova-2) AND (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch in asr-config.types.ts for full semantics.
|
|
4492
|
+
appendSearch: z.boolean().optional(),
|
|
4460
4493
|
// Debug options (FOR DEBUG/TESTING ONLY - not for production use)
|
|
4461
4494
|
debugCommand: RequestDebugCommandSchema
|
|
4462
4495
|
});
|
|
@@ -4611,6 +4644,21 @@ var AudioEncoding;
|
|
|
4611
4644
|
return NAME_TO_ENUM.has(nameStr.toUpperCase());
|
|
4612
4645
|
}
|
|
4613
4646
|
AudioEncoding2.isNameValid = isNameValid;
|
|
4647
|
+
/**
 * Normalize a caller-supplied encoding into a numeric AudioEncoding id.
 *
 * - a number is returned untouched (no range check is performed here);
 * - `undefined` falls back to LINEAR16;
 * - anything else is resolved through `fromName`, and an unresolvable name
 *   throws.
 *
 * @param value - Encoding id, encoding name, or undefined.
 * @param onStringInput - Optional callback that receives a warning message
 *   when a name was resolved successfully.
 * @returns The numeric encoding id.
 * @throws {Error} When `fromName` cannot resolve `value`.
 */
function coerce2(value, onStringInput) {
  if (typeof value === "number") {
    return value;
  }
  if (value === void 0) {
    return AudioEncoding2.LINEAR16;
  }
  const encoding = fromName(value);
  if (encoding !== void 0) {
    // Only build the warning text when somebody is listening for it.
    if (onStringInput != null) {
      onStringInput(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(encoding)} enum for type safety`);
    }
    return encoding;
  }
  throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
}
|
|
4661
|
+
AudioEncoding2.coerce = coerce2;
|
|
4614
4662
|
})(AudioEncoding || (AudioEncoding = {}));
|
|
4615
4663
|
var PREFIX_AUDIO_ENCODING_OFFSET = 128;
|
|
4616
4664
|
var SampleRate;
|
|
@@ -5381,6 +5429,37 @@ var MessageHandler = class {
|
|
|
5381
5429
|
}
|
|
5382
5430
|
};
|
|
5383
5431
|
|
|
5432
|
+
// src/utils/audio-resampler.ts
|
|
5433
|
+
/**
 * Downsample signed 16-bit PCM audio by box-filter averaging.
 *
 * With `ratio = srcRate / targetRate`, output sample `i` is the rounded mean
 * of the source samples at indices `[floor(i * ratio), ceil((i + 1) * ratio))`,
 * clamped to the end of the input. The output holds `floor(samples / ratio)`
 * samples. The returned buffer is always freshly allocated; the input is
 * neither mutated nor aliased.
 *
 * @param {ArrayBuffer|ArrayBufferView} input - PCM16 samples, read through an
 *   Int16Array (platform byte order).
 * @param {number} srcRate - Sample rate of `input`, in Hz.
 * @param {number} targetRate - Desired output sample rate, in Hz.
 * @returns {ArrayBuffer} PCM16 samples at `targetRate`.
 * @throws {RangeError} If either rate is not a positive finite number, or if
 *   the input has an odd number of bytes.
 * @throws {Error} If `targetRate` is greater than `srcRate` (no upsampling).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  // Reject unusable rates up front: NaN / undefined / 0 / Infinity would
  // otherwise produce a NaN or zero output length and silently drop all audio.
  for (const [label, rate] of [["srcRate", srcRate], ["targetRate", targetRate]]) {
    if (typeof rate !== "number" || !Number.isFinite(rate) || rate <= 0) {
      throw new RangeError(
        `downsamplePcm16: ${label} must be a positive finite number (got ${String(rate)}).`
      );
    }
  }
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // Copy the viewed byte range so an odd byteOffset cannot break Int16Array alignment.
  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
  const byteLength = buffer?.byteLength;
  if (typeof byteLength === "number" && byteLength % 2 !== 0) {
    // Int16Array would throw an opaque RangeError here; name the real cause instead.
    throw new RangeError(
      `downsamplePcm16: PCM16 input must contain an even number of bytes (got ${byteLength}).`
    );
  }
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
|
|
5462
|
+
|
|
5384
5463
|
// src/errors.ts
|
|
5385
5464
|
var RecognitionError = class extends Error {
|
|
5386
5465
|
constructor(errorType, message) {
|
|
@@ -5481,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5481
5560
|
const retryConfig = config.connectionRetry || {};
|
|
5482
5561
|
const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
|
|
5483
5562
|
const delayMs = retryConfig.delayMs ?? 200;
|
|
5563
|
+
const normalizedASRConfig = config.asrRequestConfig ? {
|
|
5564
|
+
...config.asrRequestConfig,
|
|
5565
|
+
encoding: AudioEncoding.coerce(
|
|
5566
|
+
config.asrRequestConfig.encoding,
|
|
5567
|
+
(warning) => config.logger?.("warn", warning)
|
|
5568
|
+
)
|
|
5569
|
+
} : void 0;
|
|
5484
5570
|
this.config = {
|
|
5485
5571
|
url,
|
|
5486
5572
|
audioUtteranceId,
|
|
5487
|
-
...
|
|
5573
|
+
...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
|
|
5488
5574
|
...config.gameContext && { gameContext: config.gameContext },
|
|
5489
5575
|
...config.callbackUrls && { callbackUrls: config.callbackUrls },
|
|
5490
5576
|
onTranscript: config.onTranscript || (() => {
|
|
@@ -5682,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5682
5768
|
}
|
|
5683
5769
|
this.sendAudioInternal(audioData);
|
|
5684
5770
|
}
|
|
5771
|
+
/**
|
|
5772
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
5773
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
5774
|
+
* before sending.
|
|
5775
|
+
*
|
|
5776
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
5777
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
5778
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
5779
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
5780
|
+
* `sendAudio()` to skip the resample step.
|
|
5781
|
+
*
|
|
5782
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
5783
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
5784
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
5785
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
5786
|
+
*
|
|
5787
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
5788
|
+
* mixed to mono by the caller.
|
|
5789
|
+
*
|
|
5790
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
5791
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
5792
|
+
*/
|
|
5793
|
+
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
5794
|
+
const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5795
|
+
if (audioData instanceof Blob) {
|
|
5796
|
+
blobToArrayBuffer(audioData).then((arrayBuffer) => {
|
|
5797
|
+
this.sendAudioInternal(
|
|
5798
|
+
downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
|
|
5799
|
+
);
|
|
5800
|
+
}).catch((error) => {
|
|
5801
|
+
this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
|
|
5802
|
+
});
|
|
5803
|
+
return;
|
|
5804
|
+
}
|
|
5805
|
+
this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
|
|
5806
|
+
}
|
|
5685
5807
|
sendAudioInternal(audioData) {
|
|
5686
5808
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5687
5809
|
if (bytes === 0) return;
|
|
@@ -5828,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5828
5950
|
model: this.config.asrRequestConfig.model,
|
|
5829
5951
|
language: this.config.asrRequestConfig.language?.toString() || "en",
|
|
5830
5952
|
sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
|
|
5831
|
-
encoding:
|
|
5953
|
+
encoding: this.config.asrRequestConfig.encoding,
|
|
5832
5954
|
interimResults: this.config.asrRequestConfig.interimResults ?? false,
|
|
5833
5955
|
// Auto-enable useContext if gameContext is provided, or use explicit value if set
|
|
5834
5956
|
useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
|
|
@@ -5853,6 +5975,12 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5853
5975
|
// Streaming audio metrics opt-in (ms interval). Server only forwards metrics if > 0.
|
|
5854
5976
|
...this.config.asrRequestConfig.audioMetricsIntervalMs !== void 0 && {
|
|
5855
5977
|
audioMetricsIntervalMs: this.config.asrRequestConfig.audioMetricsIntervalMs
|
|
5978
|
+
},
|
|
5979
|
+
// Opt-in: round-trip Deepgram nova-2 search-phrase hits into the
|
|
5980
|
+
// transcript. Only fires server-side when (model = nova-2) AND
|
|
5981
|
+
// (GameContext.gamePhase = 'Solve Puzzle'). See ASRRequestConfig.appendSearch.
|
|
5982
|
+
...this.config.asrRequestConfig.appendSearch !== void 0 && {
|
|
5983
|
+
appendSearch: this.config.asrRequestConfig.appendSearch
|
|
5856
5984
|
}
|
|
5857
5985
|
};
|
|
5858
5986
|
super.sendMessage(
|
|
@@ -5986,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5986
6114
|
*/
|
|
5987
6115
|
sendAudioNow(audioData) {
|
|
5988
6116
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5989
|
-
const encodingId = this.config.asrRequestConfig?.encoding
|
|
6117
|
+
const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
5990
6118
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5991
6119
|
super.sendAudio(
|
|
5992
6120
|
audioData,
|
|
@@ -6054,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
6054
6182
|
sendPrefixAudioNow(audioData) {
|
|
6055
6183
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
6056
6184
|
if (byteLength === 0) return;
|
|
6057
|
-
const baseEncodingId = this.config.asrRequestConfig?.encoding
|
|
6185
|
+
const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
6058
6186
|
const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
|
|
6059
6187
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
6060
6188
|
this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
|
|
@@ -6582,17 +6710,28 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6582
6710
|
await this.client.connect();
|
|
6583
6711
|
}
|
|
6584
6712
|
sendAudio(audioData) {
|
|
6585
|
-
|
|
6586
|
-
this.isRecordingAudio = true;
|
|
6587
|
-
this.state = {
|
|
6588
|
-
...this.state,
|
|
6589
|
-
startRecordingStatus: "RECORDING",
|
|
6590
|
-
startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
6591
|
-
};
|
|
6592
|
-
this.notifyStateChange();
|
|
6593
|
-
}
|
|
6713
|
+
this.markRecordingStarted();
|
|
6594
6714
|
this.client.sendAudio(audioData);
|
|
6595
6715
|
}
|
|
6716
|
+
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
6717
|
+
this.markRecordingStarted();
|
|
6718
|
+
this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
|
|
6719
|
+
}
|
|
6720
|
+
/**
|
|
6721
|
+
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
6722
|
+
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
6723
|
+
* `isRecordingAudio`.
|
|
6724
|
+
*/
|
|
6725
|
+
markRecordingStarted() {
|
|
6726
|
+
if (this.isRecordingAudio) return;
|
|
6727
|
+
this.isRecordingAudio = true;
|
|
6728
|
+
this.state = {
|
|
6729
|
+
...this.state,
|
|
6730
|
+
startRecordingStatus: "RECORDING",
|
|
6731
|
+
startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
6732
|
+
};
|
|
6733
|
+
this.notifyStateChange();
|
|
6734
|
+
}
|
|
6596
6735
|
async stopRecording() {
|
|
6597
6736
|
this.isRecordingAudio = false;
|
|
6598
6737
|
this.state = updateStateOnStop(this.state);
|
|
@@ -6704,6 +6843,7 @@ function createSimplifiedVGFClient(config) {
|
|
|
6704
6843
|
return new SimplifiedVGFRecognitionClient(config);
|
|
6705
6844
|
}
|
|
6706
6845
|
export {
|
|
6846
|
+
AmazonNovaSonicModel,
|
|
6707
6847
|
AudioEncoding,
|
|
6708
6848
|
AwsTranscribeModel,
|
|
6709
6849
|
BedrockModel,
|