@volley/recognition-client-sdk 0.1.782 → 0.1.799
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +60 -4
- package/dist/index.bundled.d.ts +75 -4
- package/dist/index.js +115 -13
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +95 -4
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +17 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +16 -1
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/package.json +3 -3
- package/src/index.spec.ts +2 -0
- package/src/recognition-client.ts +65 -7
- package/src/recognition-client.types.ts +21 -0
- package/src/simplified-vgf-recognition-client.ts +44 -17
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
|
@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
|
|
|
159
159
|
}
|
|
160
160
|
/**
|
|
161
161
|
* Self-serve vLLM batch transcription models
|
|
162
|
-
* Backed by recognition-inference / RunPod `/transcribe`
|
|
162
|
+
* Backed by recognition-inference / RunPod `/ws/transcribe`
|
|
163
163
|
*/
|
|
164
164
|
declare enum SelfServeVllmModel {
|
|
165
|
+
QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
|
|
166
|
+
QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
|
|
165
167
|
QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
|
|
166
168
|
}
|
|
167
169
|
/**
|
|
@@ -651,6 +653,20 @@ declare namespace AudioEncoding {
|
|
|
651
653
|
* @returns true if valid encoding name
|
|
652
654
|
*/
|
|
653
655
|
function isNameValid(nameStr: string): boolean;
|
|
656
|
+
/**
|
|
657
|
+
* Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
|
|
658
|
+
*
|
|
659
|
+
* - enum / number → returned as-is (already AudioEncoding-shaped)
|
|
660
|
+
* - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
|
|
661
|
+
* Invokes `onStringInput` with a warning message so callers can route it
|
|
662
|
+
* to their preferred logger.
|
|
663
|
+
* - invalid string → throws (preferred over silent fallback so typos surface)
|
|
664
|
+
* - undefined → defaults to {@link AudioEncoding.LINEAR16}
|
|
665
|
+
*
|
|
666
|
+
* Always normalize at the SDK / server boundary so downstream code can rely
|
|
667
|
+
* on a numeric AudioEncoding (the wire-level binary frame header is uint32).
|
|
668
|
+
*/
|
|
669
|
+
function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
|
|
654
670
|
}
|
|
655
671
|
/**
|
|
656
672
|
* Common sample rates (in Hz)
|
|
@@ -906,10 +922,10 @@ interface ASRRequestConfig {
|
|
|
906
922
|
* doesn't respond with is_final=true after stopRecording().
|
|
907
923
|
*
|
|
908
924
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
909
|
-
* - balanced: 500ms -
|
|
910
|
-
* - conservative: 1000ms - wait longer for complex utterances
|
|
925
|
+
* - balanced: 500ms - good for most cases
|
|
926
|
+
* - conservative: 1000ms - current default, wait longer for complex utterances
|
|
911
927
|
*
|
|
912
|
-
* @default '
|
|
928
|
+
* @default 'conservative'
|
|
913
929
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
914
930
|
*/
|
|
915
931
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
@@ -1404,6 +1420,23 @@ interface IRecognitionClient {
|
|
|
1404
1420
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
1405
1421
|
*/
|
|
1406
1422
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1423
|
+
/**
|
|
1424
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
1425
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
1426
|
+
* server validator) before transmitting.
|
|
1427
|
+
*
|
|
1428
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1429
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
1430
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
1431
|
+
* skip the resample step.
|
|
1432
|
+
*
|
|
1433
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1434
|
+
* mixed to mono by the caller.
|
|
1435
|
+
*
|
|
1436
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1437
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1438
|
+
*/
|
|
1439
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1407
1440
|
/**
|
|
1408
1441
|
* Stop recording and wait for final transcript
|
|
1409
1442
|
* The server will close the connection after sending the final transcript.
|
|
@@ -1603,6 +1636,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1603
1636
|
*/
|
|
1604
1637
|
private connectWithRetry;
|
|
1605
1638
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1639
|
+
/**
|
|
1640
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
1641
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
1642
|
+
* before sending.
|
|
1643
|
+
*
|
|
1644
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1645
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
1646
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
1647
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
1648
|
+
* `sendAudio()` to skip the resample step.
|
|
1649
|
+
*
|
|
1650
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
1651
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
1652
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
1653
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
1654
|
+
*
|
|
1655
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1656
|
+
* mixed to mono by the caller.
|
|
1657
|
+
*
|
|
1658
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1659
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1660
|
+
*/
|
|
1661
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1606
1662
|
private sendAudioInternal;
|
|
1607
1663
|
/**
|
|
1608
1664
|
* Only active when the client is in READY state; otherwise it returns immediately.
|
package/dist/index.bundled.d.ts
CHANGED
|
@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
|
|
|
159
159
|
}
|
|
160
160
|
/**
|
|
161
161
|
* Self-serve vLLM batch transcription models
|
|
162
|
-
* Backed by recognition-inference / RunPod `/transcribe`
|
|
162
|
+
* Backed by recognition-inference / RunPod `/ws/transcribe`
|
|
163
163
|
*/
|
|
164
164
|
declare enum SelfServeVllmModel {
|
|
165
|
+
QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
|
|
166
|
+
QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
|
|
165
167
|
QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
|
|
166
168
|
}
|
|
167
169
|
/**
|
|
@@ -660,6 +662,20 @@ declare namespace AudioEncoding {
|
|
|
660
662
|
* @returns true if valid encoding name
|
|
661
663
|
*/
|
|
662
664
|
function isNameValid(nameStr: string): boolean;
|
|
665
|
+
/**
|
|
666
|
+
* Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
|
|
667
|
+
*
|
|
668
|
+
* - enum / number → returned as-is (already AudioEncoding-shaped)
|
|
669
|
+
* - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
|
|
670
|
+
* Invokes `onStringInput` with a warning message so callers can route it
|
|
671
|
+
* to their preferred logger.
|
|
672
|
+
* - invalid string → throws (preferred over silent fallback so typos surface)
|
|
673
|
+
* - undefined → defaults to {@link AudioEncoding.LINEAR16}
|
|
674
|
+
*
|
|
675
|
+
* Always normalize at the SDK / server boundary so downstream code can rely
|
|
676
|
+
* on a numeric AudioEncoding (the wire-level binary frame header is uint32).
|
|
677
|
+
*/
|
|
678
|
+
function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
|
|
663
679
|
}
|
|
664
680
|
/**
|
|
665
681
|
* Common sample rates (in Hz)
|
|
@@ -1636,10 +1652,10 @@ interface ASRRequestConfig {
|
|
|
1636
1652
|
* doesn't respond with is_final=true after stopRecording().
|
|
1637
1653
|
*
|
|
1638
1654
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
1639
|
-
* - balanced: 500ms -
|
|
1640
|
-
* - conservative: 1000ms - wait longer for complex utterances
|
|
1655
|
+
* - balanced: 500ms - good for most cases
|
|
1656
|
+
* - conservative: 1000ms - current default, wait longer for complex utterances
|
|
1641
1657
|
*
|
|
1642
|
-
* @default '
|
|
1658
|
+
* @default 'conservative'
|
|
1643
1659
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
1644
1660
|
*/
|
|
1645
1661
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
@@ -2173,6 +2189,23 @@ interface IRecognitionClient {
|
|
|
2173
2189
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
2174
2190
|
*/
|
|
2175
2191
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2192
|
+
/**
|
|
2193
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
2194
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
2195
|
+
* server validator) before transmitting.
|
|
2196
|
+
*
|
|
2197
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
2198
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
2199
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
2200
|
+
* skip the resample step.
|
|
2201
|
+
*
|
|
2202
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
2203
|
+
* mixed to mono by the caller.
|
|
2204
|
+
*
|
|
2205
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
2206
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
2207
|
+
*/
|
|
2208
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2176
2209
|
/**
|
|
2177
2210
|
* Stop recording and wait for final transcript
|
|
2178
2211
|
* The server will close the connection after sending the final transcript.
|
|
@@ -2378,6 +2411,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
2378
2411
|
*/
|
|
2379
2412
|
private connectWithRetry;
|
|
2380
2413
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2414
|
+
/**
|
|
2415
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
2416
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
2417
|
+
* before sending.
|
|
2418
|
+
*
|
|
2419
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
2420
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
2421
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
2422
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
2423
|
+
* `sendAudio()` to skip the resample step.
|
|
2424
|
+
*
|
|
2425
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
2426
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
2427
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
2428
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
2429
|
+
*
|
|
2430
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
2431
|
+
* mixed to mono by the caller.
|
|
2432
|
+
*
|
|
2433
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
2434
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
2435
|
+
*/
|
|
2436
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2381
2437
|
private sendAudioInternal;
|
|
2382
2438
|
/**
|
|
2383
2439
|
* Only active when the client is in READY state; otherwise it returns immediately.
|
|
@@ -2801,6 +2857,14 @@ interface ISimplifiedVGFRecognitionClient {
|
|
|
2801
2857
|
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
2802
2858
|
*/
|
|
2803
2859
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2860
|
+
/**
|
|
2861
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
2862
|
+
* downsamples to the session's target rate before transmitting. Use
|
|
2863
|
+
* when capture is at the system's native rate (browser AudioContext is
|
|
2864
|
+
* typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
|
|
2865
|
+
* little-endian PCM, mono.
|
|
2866
|
+
*/
|
|
2867
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2804
2868
|
/**
|
|
2805
2869
|
* Stop recording and wait for final transcription
|
|
2806
2870
|
* @returns Promise that resolves when transcription is complete
|
|
@@ -2891,6 +2955,13 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
|
|
|
2891
2955
|
constructor(config: SimplifiedVGFClientConfig);
|
|
2892
2956
|
connect(): Promise<void>;
|
|
2893
2957
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2958
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2959
|
+
/**
|
|
2960
|
+
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
2961
|
+
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
2962
|
+
* `isRecordingAudio`.
|
|
2963
|
+
*/
|
|
2964
|
+
private markRecordingStarted;
|
|
2894
2965
|
stopRecording(): Promise<void>;
|
|
2895
2966
|
stopAbnormally(): void;
|
|
2896
2967
|
getAudioUtteranceId(): string;
|
package/dist/index.js
CHANGED
|
@@ -3838,6 +3838,8 @@ var AmazonNovaSonicModel;
|
|
|
3838
3838
|
})(AmazonNovaSonicModel || (AmazonNovaSonicModel = {}));
|
|
3839
3839
|
var SelfServeVllmModel;
|
|
3840
3840
|
(function(SelfServeVllmModel2) {
|
|
3841
|
+
SelfServeVllmModel2["QWEN3_ASR_0_6B"] = "qwen3-asr-0.6b";
|
|
3842
|
+
SelfServeVllmModel2["QWEN3_ASR_0_6B_WOF_LETTER"] = "qwen3-asr-0.6b-wof-letter";
|
|
3841
3843
|
SelfServeVllmModel2["QWEN3_ASR_1_7B"] = "qwen3-asr-1.7b";
|
|
3842
3844
|
})(SelfServeVllmModel || (SelfServeVllmModel = {}));
|
|
3843
3845
|
|
|
@@ -4642,6 +4644,21 @@ var AudioEncoding;
|
|
|
4642
4644
|
return NAME_TO_ENUM.has(nameStr.toUpperCase());
|
|
4643
4645
|
}
|
|
4644
4646
|
AudioEncoding2.isNameValid = isNameValid;
|
|
4647
|
+
/**
 * Normalize an encoding value to the AudioEncoding enum.
 *
 * - `undefined` yields `AudioEncoding2.LINEAR16`.
 * - A number is returned unchanged.
 * - Any other value is looked up via `fromName`; a miss throws, a hit is
 *   reported through the optional `onStringInput` callback and returned.
 *
 * @param value - Encoding as enum/number, name string, or undefined.
 * @param onStringInput - Optional sink for the "string was used" warning.
 * @returns The resolved encoding value.
 * @throws Error when `fromName` does not recognize the string.
 */
function coerce2(value, onStringInput) {
  switch (typeof value) {
    case "undefined":
      return AudioEncoding2.LINEAR16;
    case "number":
      return value;
    default:
      break;
  }
  const parsed = fromName(value);
  if (parsed !== void 0) {
    // Valid name: let the caller log that the enum form is preferred.
    onStringInput?.(`encoding passed as string '${value}'; prefer AudioEncoding.${toName(parsed)} enum for type safety`);
    return parsed;
  }
  throw new Error(`Invalid encoding string: '${value}'. Use AudioEncoding enum or one of: LINEAR16, OGG_OPUS, FLAC, MULAW, ALAW (case insensitive)`);
}
|
|
4661
|
+
AudioEncoding2.coerce = coerce2;
|
|
4645
4662
|
})(AudioEncoding || (AudioEncoding = {}));
|
|
4646
4663
|
var PREFIX_AUDIO_ENCODING_OFFSET = 128;
|
|
4647
4664
|
var SampleRate;
|
|
@@ -5412,6 +5429,37 @@ var MessageHandler = class {
|
|
|
5412
5429
|
}
|
|
5413
5430
|
};
|
|
5414
5431
|
|
|
5432
|
+
// src/utils/audio-resampler.ts
|
|
5433
|
+
/**
 * Downsample signed 16-bit PCM audio by box-filter averaging.
 *
 * Each output sample is the rounded mean of the source samples whose indices
 * fall in `[floor(i * ratio), ceil((i + 1) * ratio))`, where
 * `ratio = srcRate / targetRate`.
 *
 * @param input - PCM16 samples as an ArrayBuffer or any ArrayBufferView.
 *   The input is never mutated; a view is copied out of its backing buffer.
 * @param srcRate - Sample rate of `input` in Hz.
 * @param targetRate - Desired output sample rate in Hz (must be <= srcRate).
 * @returns A new ArrayBuffer holding the downsampled PCM16 samples. When the
 *   rates are equal or the input is empty, a copy of the input samples.
 * @throws Error if `targetRate > srcRate` (upsampling is not supported) or if
 *   either rate is not a positive finite number.
 * @throws RangeError if the input byte length is odd (not whole PCM16 samples).
 */
function downsamplePcm16(input, srcRate, targetRate) {
  if (targetRate > srcRate) {
    throw new Error(
      `downsamplePcm16: cannot upsample from ${srcRate}Hz to ${targetRate}Hz; capture audio at \u2265 ${targetRate}Hz instead.`
    );
  }
  // NaN compares false above and a zero target rate yields an infinite ratio;
  // both used to produce an empty buffer, silently dropping the audio.
  if (!Number.isFinite(srcRate) || !Number.isFinite(targetRate) || srcRate <= 0 || targetRate <= 0) {
    throw new Error(
      `downsamplePcm16: sample rates must be positive finite numbers (got srcRate=${srcRate}, targetRate=${targetRate}).`
    );
  }
  // Copy a view's bytes so the Int16Array below starts at offset 0 regardless
  // of the view's own byteOffset alignment.
  const buffer = ArrayBuffer.isView(input) ? input.buffer.slice(input.byteOffset, input.byteOffset + input.byteLength) : input;
  if (buffer.byteLength % 2 !== 0) {
    // The Int16Array constructor would throw a RangeError anyway; keep the
    // error type but say what is actually wrong.
    throw new RangeError(
      `downsamplePcm16: PCM16 input must contain an even number of bytes (got ${buffer.byteLength}).`
    );
  }
  const src = new Int16Array(buffer);
  if (srcRate === targetRate || src.length === 0) {
    return src.slice().buffer;
  }
  const ratio = srcRate / targetRate;
  const dstLen = Math.floor(src.length / ratio);
  const dst = new Int16Array(dstLen);
  for (let i = 0; i < dstLen; i++) {
    const startIdx = Math.floor(i * ratio);
    const endIdx = Math.min(Math.ceil((i + 1) * ratio), src.length);
    let sum = 0;
    for (let j = startIdx; j < endIdx; j++) {
      sum += src[j];
    }
    const count = endIdx - startIdx;
    dst[i] = count > 0 ? Math.round(sum / count) : 0;
  }
  return dst.buffer;
}
|
|
5462
|
+
|
|
5415
5463
|
// src/errors.ts
|
|
5416
5464
|
var RecognitionError = class extends Error {
|
|
5417
5465
|
constructor(errorType, message) {
|
|
@@ -5512,10 +5560,17 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5512
5560
|
const retryConfig = config.connectionRetry || {};
|
|
5513
5561
|
const maxAttempts = Math.max(1, Math.min(5, retryConfig.maxAttempts ?? 4));
|
|
5514
5562
|
const delayMs = retryConfig.delayMs ?? 200;
|
|
5563
|
+
const normalizedASRConfig = config.asrRequestConfig ? {
|
|
5564
|
+
...config.asrRequestConfig,
|
|
5565
|
+
encoding: AudioEncoding.coerce(
|
|
5566
|
+
config.asrRequestConfig.encoding,
|
|
5567
|
+
(warning) => config.logger?.("warn", warning)
|
|
5568
|
+
)
|
|
5569
|
+
} : void 0;
|
|
5515
5570
|
this.config = {
|
|
5516
5571
|
url,
|
|
5517
5572
|
audioUtteranceId,
|
|
5518
|
-
...
|
|
5573
|
+
...normalizedASRConfig && { asrRequestConfig: normalizedASRConfig },
|
|
5519
5574
|
...config.gameContext && { gameContext: config.gameContext },
|
|
5520
5575
|
...config.callbackUrls && { callbackUrls: config.callbackUrls },
|
|
5521
5576
|
onTranscript: config.onTranscript || (() => {
|
|
@@ -5713,6 +5768,42 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5713
5768
|
}
|
|
5714
5769
|
this.sendAudioInternal(audioData);
|
|
5715
5770
|
}
|
|
5771
|
+
/**
|
|
5772
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
5773
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
5774
|
+
* before sending.
|
|
5775
|
+
*
|
|
5776
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
5777
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
5778
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
5779
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
5780
|
+
* `sendAudio()` to skip the resample step.
|
|
5781
|
+
*
|
|
5782
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
5783
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
5784
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
5785
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
5786
|
+
*
|
|
5787
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
5788
|
+
* mixed to mono by the caller.
|
|
5789
|
+
*
|
|
5790
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
5791
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
5792
|
+
*/
|
|
5793
|
+
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
5794
|
+
const targetRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
5795
|
+
if (audioData instanceof Blob) {
|
|
5796
|
+
blobToArrayBuffer(audioData).then((arrayBuffer) => {
|
|
5797
|
+
this.sendAudioInternal(
|
|
5798
|
+
downsamplePcm16(arrayBuffer, sourceSampleRate, targetRate)
|
|
5799
|
+
);
|
|
5800
|
+
}).catch((error) => {
|
|
5801
|
+
this.log("warn", "Failed to convert Blob to ArrayBuffer", error);
|
|
5802
|
+
});
|
|
5803
|
+
return;
|
|
5804
|
+
}
|
|
5805
|
+
this.sendAudioInternal(downsamplePcm16(audioData, sourceSampleRate, targetRate));
|
|
5806
|
+
}
|
|
5716
5807
|
sendAudioInternal(audioData) {
|
|
5717
5808
|
const bytes = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
5718
5809
|
if (bytes === 0) return;
|
|
@@ -5859,7 +5950,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
5859
5950
|
model: this.config.asrRequestConfig.model,
|
|
5860
5951
|
language: this.config.asrRequestConfig.language?.toString() || "en",
|
|
5861
5952
|
sampleRate: typeof this.config.asrRequestConfig.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000,
|
|
5862
|
-
encoding:
|
|
5953
|
+
encoding: this.config.asrRequestConfig.encoding,
|
|
5863
5954
|
interimResults: this.config.asrRequestConfig.interimResults ?? false,
|
|
5864
5955
|
// Auto-enable useContext if gameContext is provided, or use explicit value if set
|
|
5865
5956
|
useContext: this.config.asrRequestConfig.useContext ?? !!this.config.gameContext,
|
|
@@ -6023,7 +6114,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
6023
6114
|
*/
|
|
6024
6115
|
sendAudioNow(audioData) {
|
|
6025
6116
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
6026
|
-
const encodingId = this.config.asrRequestConfig?.encoding
|
|
6117
|
+
const encodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
6027
6118
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
6028
6119
|
super.sendAudio(
|
|
6029
6120
|
audioData,
|
|
@@ -6091,7 +6182,7 @@ var RealTimeTwoWayWebSocketRecognitionClient = class _RealTimeTwoWayWebSocketRec
|
|
|
6091
6182
|
sendPrefixAudioNow(audioData) {
|
|
6092
6183
|
const byteLength = ArrayBuffer.isView(audioData) ? audioData.byteLength : audioData.byteLength;
|
|
6093
6184
|
if (byteLength === 0) return;
|
|
6094
|
-
const baseEncodingId = this.config.asrRequestConfig?.encoding
|
|
6185
|
+
const baseEncodingId = this.config.asrRequestConfig?.encoding ?? AudioEncoding.LINEAR16;
|
|
6095
6186
|
const prefixEncodingId = baseEncodingId + PREFIX_AUDIO_ENCODING_OFFSET;
|
|
6096
6187
|
const sampleRate = typeof this.config.asrRequestConfig?.sampleRate === "number" ? this.config.asrRequestConfig.sampleRate : SampleRate.RATE_16000;
|
|
6097
6188
|
this.log("debug", "Sending prefix audio", { bytes: byteLength, encoding: prefixEncodingId });
|
|
@@ -6619,17 +6710,28 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6619
6710
|
await this.client.connect();
|
|
6620
6711
|
}
|
|
6621
6712
|
sendAudio(audioData) {
|
|
6622
|
-
|
|
6623
|
-
this.isRecordingAudio = true;
|
|
6624
|
-
this.state = {
|
|
6625
|
-
...this.state,
|
|
6626
|
-
startRecordingStatus: "RECORDING",
|
|
6627
|
-
startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
6628
|
-
};
|
|
6629
|
-
this.notifyStateChange();
|
|
6630
|
-
}
|
|
6713
|
+
this.markRecordingStarted();
|
|
6631
6714
|
this.client.sendAudio(audioData);
|
|
6632
6715
|
}
|
|
6716
|
+
// Marks recording as started, then forwards the audio and its source sample
// rate unchanged to the wrapped client's sendAudioWithSampleRate.
sendAudioWithSampleRate(audioData, sourceSampleRate) {
|
|
6717
|
+
this.markRecordingStarted();
|
|
6718
|
+
this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
|
|
6719
|
+
}
|
|
6720
|
+
/**
|
|
6721
|
+
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
6722
|
+
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
6723
|
+
* `isRecordingAudio`.
|
|
6724
|
+
*/
|
|
6725
|
+
markRecordingStarted() {
|
|
6726
|
+
if (this.isRecordingAudio) return;
|
|
6727
|
+
this.isRecordingAudio = true;
|
|
6728
|
+
this.state = {
|
|
6729
|
+
...this.state,
|
|
6730
|
+
startRecordingStatus: "RECORDING",
|
|
6731
|
+
startRecordingTimestamp: (/* @__PURE__ */ new Date()).toISOString()
|
|
6732
|
+
};
|
|
6733
|
+
this.notifyStateChange();
|
|
6734
|
+
}
|
|
6633
6735
|
async stopRecording() {
|
|
6634
6736
|
this.isRecordingAudio = false;
|
|
6635
6737
|
this.state = updateStateOnStop(this.state);
|