@volley/recognition-client-sdk 0.1.782 → 0.1.800
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +75 -4
- package/dist/index.bundled.d.ts +198 -87
- package/dist/index.js +191 -20
- package/dist/index.js.map +4 -4
- package/dist/recog-client-sdk.browser.js +95 -4
- package/dist/recog-client-sdk.browser.js.map +4 -4
- package/dist/recognition-client.d.ts +23 -0
- package/dist/recognition-client.d.ts.map +1 -1
- package/dist/recognition-client.types.d.ts +32 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +22 -85
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/utils/audio-resampler.d.ts +32 -0
- package/dist/utils/audio-resampler.d.ts.map +1 -0
- package/dist/vgf-recognition-mapper.d.ts +9 -17
- package/dist/vgf-recognition-mapper.d.ts.map +1 -1
- package/dist/vgf-recognition-state.d.ts +103 -0
- package/dist/vgf-recognition-state.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/index.spec.ts +2 -0
- package/src/recognition-client.ts +65 -7
- package/src/recognition-client.types.ts +37 -0
- package/src/simplified-vgf-recognition-client.spec.ts +0 -27
- package/src/simplified-vgf-recognition-client.ts +97 -127
- package/src/utils/audio-resampler.spec.ts +69 -0
- package/src/utils/audio-resampler.ts +79 -0
- package/src/vgf-recognition-mapper.spec.ts +143 -0
- package/src/vgf-recognition-mapper.ts +35 -45
- package/src/vgf-recognition-state.ts +19 -1
|
@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
|
|
|
159
159
|
}
|
|
160
160
|
/**
|
|
161
161
|
* Self-serve vLLM batch transcription models
|
|
162
|
-
* Backed by recognition-inference / RunPod `/transcribe`
|
|
162
|
+
* Backed by recognition-inference / RunPod `/ws/transcribe`
|
|
163
163
|
*/
|
|
164
164
|
declare enum SelfServeVllmModel {
|
|
165
|
+
QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
|
|
166
|
+
QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
|
|
165
167
|
QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
|
|
166
168
|
}
|
|
167
169
|
/**
|
|
@@ -651,6 +653,20 @@ declare namespace AudioEncoding {
|
|
|
651
653
|
* @returns true if valid encoding name
|
|
652
654
|
*/
|
|
653
655
|
function isNameValid(nameStr: string): boolean;
|
|
656
|
+
/**
|
|
657
|
+
* Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
|
|
658
|
+
*
|
|
659
|
+
* - enum / number → returned as-is (already AudioEncoding-shaped)
|
|
660
|
+
* - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
|
|
661
|
+
* Invokes `onStringInput` with a warning message so callers can route it
|
|
662
|
+
* to their preferred logger.
|
|
663
|
+
* - invalid string → throws (preferred over silent fallback so typos surface)
|
|
664
|
+
* - undefined → defaults to {@link AudioEncoding.LINEAR16}
|
|
665
|
+
*
|
|
666
|
+
* Always normalize at the SDK / server boundary so downstream code can rely
|
|
667
|
+
* on a numeric AudioEncoding (the wire-level binary frame header is uint32).
|
|
668
|
+
*/
|
|
669
|
+
function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
|
|
654
670
|
}
|
|
655
671
|
/**
|
|
656
672
|
* Common sample rates (in Hz)
|
|
@@ -906,10 +922,10 @@ interface ASRRequestConfig {
|
|
|
906
922
|
* doesn't respond with is_final=true after stopRecording().
|
|
907
923
|
*
|
|
908
924
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
909
|
-
* - balanced: 500ms -
|
|
910
|
-
* - conservative: 1000ms - wait longer for complex utterances
|
|
925
|
+
* - balanced: 500ms - good for most cases
|
|
926
|
+
* - conservative: 1000ms - current default, wait longer for complex utterances
|
|
911
927
|
*
|
|
912
|
-
* @default '
|
|
928
|
+
* @default 'conservative'
|
|
913
929
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
914
930
|
*/
|
|
915
931
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
@@ -1390,6 +1406,11 @@ interface IRecognitionClientConfig {
|
|
|
1390
1406
|
*
|
|
1391
1407
|
* Main interface for real-time speech recognition clients.
|
|
1392
1408
|
* Provides methods for connection management, audio streaming, and session control.
|
|
1409
|
+
*
|
|
1410
|
+
* NOTE for maintainers: `ISimplifiedVGFRecognitionClient` extends this interface,
|
|
1411
|
+
* so any method added here must also be implemented (typically as a delegate) by
|
|
1412
|
+
* `SimplifiedVGFRecognitionClient`. TypeScript will flag missing delegates at
|
|
1413
|
+
* compile time — do not work around the error, add the delegate.
|
|
1393
1414
|
*/
|
|
1394
1415
|
interface IRecognitionClient {
|
|
1395
1416
|
/**
|
|
@@ -1404,6 +1425,33 @@ interface IRecognitionClient {
|
|
|
1404
1425
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
1405
1426
|
*/
|
|
1406
1427
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1428
|
+
/**
|
|
1429
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
1430
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
1431
|
+
* server validator) before transmitting.
|
|
1432
|
+
*
|
|
1433
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1434
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
1435
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
1436
|
+
* skip the resample step.
|
|
1437
|
+
*
|
|
1438
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1439
|
+
* mixed to mono by the caller.
|
|
1440
|
+
*
|
|
1441
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1442
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1443
|
+
*/
|
|
1444
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1445
|
+
/**
|
|
1446
|
+
* Send prefix audio (e.g. a TTS prompt) that primes the provider's language
|
|
1447
|
+
* model before user audio is streamed. Chunks accepted — the server buffers
|
|
1448
|
+
* until the session is READY and flushes. Must be sent BEFORE the first
|
|
1449
|
+
* `sendAudio()` to take effect. Only meaningful when
|
|
1450
|
+
* `asrRequestConfig.prefixMode === PrefixMode.CLIENT`.
|
|
1451
|
+
*
|
|
1452
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
1453
|
+
*/
|
|
1454
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1407
1455
|
/**
|
|
1408
1456
|
* Stop recording and wait for final transcript
|
|
1409
1457
|
* The server will close the connection after sending the final transcript.
|
|
@@ -1603,6 +1651,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
1603
1651
|
*/
|
|
1604
1652
|
private connectWithRetry;
|
|
1605
1653
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1654
|
+
/**
|
|
1655
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
1656
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
1657
|
+
* before sending.
|
|
1658
|
+
*
|
|
1659
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
1660
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
1661
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
1662
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
1663
|
+
* `sendAudio()` to skip the resample step.
|
|
1664
|
+
*
|
|
1665
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
1666
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
1667
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
1668
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
1669
|
+
*
|
|
1670
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
1671
|
+
* mixed to mono by the caller.
|
|
1672
|
+
*
|
|
1673
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
1674
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1675
|
+
*/
|
|
1676
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1606
1677
|
private sendAudioInternal;
|
|
1607
1678
|
/**
|
|
1608
1679
|
* Only active when the client is in READY state; otherwise it returns immediately.
|
package/dist/index.bundled.d.ts
CHANGED
|
@@ -159,9 +159,11 @@ declare enum AmazonNovaSonicModel {
|
|
|
159
159
|
}
|
|
160
160
|
/**
|
|
161
161
|
* Self-serve vLLM batch transcription models
|
|
162
|
-
* Backed by recognition-inference / RunPod `/transcribe`
|
|
162
|
+
* Backed by recognition-inference / RunPod `/ws/transcribe`
|
|
163
163
|
*/
|
|
164
164
|
declare enum SelfServeVllmModel {
|
|
165
|
+
QWEN3_ASR_0_6B = "qwen3-asr-0.6b",
|
|
166
|
+
QWEN3_ASR_0_6B_WOF_LETTER = "qwen3-asr-0.6b-wof-letter",
|
|
165
167
|
QWEN3_ASR_1_7B = "qwen3-asr-1.7b"
|
|
166
168
|
}
|
|
167
169
|
/**
|
|
@@ -660,6 +662,20 @@ declare namespace AudioEncoding {
|
|
|
660
662
|
* @returns true if valid encoding name
|
|
661
663
|
*/
|
|
662
664
|
function isNameValid(nameStr: string): boolean;
|
|
665
|
+
/**
|
|
666
|
+
* Coerce a possibly-stringly-typed encoding value into the AudioEncoding enum.
|
|
667
|
+
*
|
|
668
|
+
* - enum / number → returned as-is (already AudioEncoding-shaped)
|
|
669
|
+
* - string (case-insensitive, e.g. 'linear16', 'LINEAR16') → converted via {@link fromName}.
|
|
670
|
+
* Invokes `onStringInput` with a warning message so callers can route it
|
|
671
|
+
* to their preferred logger.
|
|
672
|
+
* - invalid string → throws (preferred over silent fallback so typos surface)
|
|
673
|
+
* - undefined → defaults to {@link AudioEncoding.LINEAR16}
|
|
674
|
+
*
|
|
675
|
+
* Always normalize at the SDK / server boundary so downstream code can rely
|
|
676
|
+
* on a numeric AudioEncoding (the wire-level binary frame header is uint32).
|
|
677
|
+
*/
|
|
678
|
+
function coerce(value: AudioEncoding | string | number | undefined, onStringInput?: (warning: string) => void): AudioEncoding;
|
|
663
679
|
}
|
|
664
680
|
/**
|
|
665
681
|
* Common sample rates (in Hz)
|
|
@@ -1636,10 +1652,10 @@ interface ASRRequestConfig {
|
|
|
1636
1652
|
* doesn't respond with is_final=true after stopRecording().
|
|
1637
1653
|
*
|
|
1638
1654
|
* - aggressive: 100ms - fast response, may cut off slow providers
|
|
1639
|
-
* - balanced: 500ms -
|
|
1640
|
-
* - conservative: 1000ms - wait longer for complex utterances
|
|
1655
|
+
* - balanced: 500ms - good for most cases
|
|
1656
|
+
* - conservative: 1000ms - current default, wait longer for complex utterances
|
|
1641
1657
|
*
|
|
1642
|
-
* @default '
|
|
1658
|
+
* @default 'conservative'
|
|
1643
1659
|
* @see FinalTranscriptStability enum for detailed descriptions
|
|
1644
1660
|
*/
|
|
1645
1661
|
finalTranscriptStability?: FinalTranscriptStability | string;
|
|
@@ -2159,6 +2175,11 @@ interface IRecognitionClientConfig {
|
|
|
2159
2175
|
*
|
|
2160
2176
|
* Main interface for real-time speech recognition clients.
|
|
2161
2177
|
* Provides methods for connection management, audio streaming, and session control.
|
|
2178
|
+
*
|
|
2179
|
+
* NOTE for maintainers: `ISimplifiedVGFRecognitionClient` extends this interface,
|
|
2180
|
+
* so any method added here must also be implemented (typically as a delegate) by
|
|
2181
|
+
* `SimplifiedVGFRecognitionClient`. TypeScript will flag missing delegates at
|
|
2182
|
+
* compile time — do not work around the error, add the delegate.
|
|
2162
2183
|
*/
|
|
2163
2184
|
interface IRecognitionClient {
|
|
2164
2185
|
/**
|
|
@@ -2173,6 +2194,33 @@ interface IRecognitionClient {
|
|
|
2173
2194
|
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
2174
2195
|
*/
|
|
2175
2196
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2197
|
+
/**
|
|
2198
|
+
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
2199
|
+
* downsamples to the session's target rate (currently 16 kHz, set by the
|
|
2200
|
+
* server validator) before transmitting.
|
|
2201
|
+
*
|
|
2202
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
2203
|
+
* native rate (browser `AudioContext` is typically 44.1 kHz or 48 kHz).
|
|
2204
|
+
* If your audio is already at the target rate, prefer `sendAudio()` to
|
|
2205
|
+
* skip the resample step.
|
|
2206
|
+
*
|
|
2207
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
2208
|
+
* mixed to mono by the caller.
|
|
2209
|
+
*
|
|
2210
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
2211
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
2212
|
+
*/
|
|
2213
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2214
|
+
/**
|
|
2215
|
+
* Send prefix audio (e.g. a TTS prompt) that primes the provider's language
|
|
2216
|
+
* model before user audio is streamed. Chunks accepted — the server buffers
|
|
2217
|
+
* until the session is READY and flushes. Must be sent BEFORE the first
|
|
2218
|
+
* `sendAudio()` to take effect. Only meaningful when
|
|
2219
|
+
* `asrRequestConfig.prefixMode === PrefixMode.CLIENT`.
|
|
2220
|
+
*
|
|
2221
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
2222
|
+
*/
|
|
2223
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2176
2224
|
/**
|
|
2177
2225
|
* Stop recording and wait for final transcript
|
|
2178
2226
|
* The server will close the connection after sending the final transcript.
|
|
@@ -2378,6 +2426,29 @@ declare class RealTimeTwoWayWebSocketRecognitionClient extends WebSocketAudioCli
|
|
|
2378
2426
|
*/
|
|
2379
2427
|
private connectWithRetry;
|
|
2380
2428
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2429
|
+
/**
|
|
2430
|
+
* Send PCM16 mono audio captured at any sample rate. The SDK downsamples
|
|
2431
|
+
* to the session's target rate (currently 16 kHz per server validator)
|
|
2432
|
+
* before sending.
|
|
2433
|
+
*
|
|
2434
|
+
* Use this when your capture pipeline produces audio at the system's
|
|
2435
|
+
* native rate — `AudioContext` defaults to 44.1 kHz or 48 kHz on most
|
|
2436
|
+
* desktop/mobile hardware — and you don't want to bring your own
|
|
2437
|
+
* resampler. If your audio is already at the target rate, prefer
|
|
2438
|
+
* `sendAudio()` to skip the resample step.
|
|
2439
|
+
*
|
|
2440
|
+
* Algorithm: box-filter averaging (see audio-resampler.ts). Cheap, no
|
|
2441
|
+
* dependencies, has a built-in low-pass effect so aliasing stays out of
|
|
2442
|
+
* the speech band. Suitable for ASR; not a substitute for a high-quality
|
|
2443
|
+
* resampler if you're doing music or full-fidelity processing.
|
|
2444
|
+
*
|
|
2445
|
+
* Audio must be signed 16-bit little-endian PCM, mono. Stereo must be
|
|
2446
|
+
* mixed to mono by the caller.
|
|
2447
|
+
*
|
|
2448
|
+
* @param audioData - PCM16 mono audio at `sourceSampleRate`.
|
|
2449
|
+
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
2450
|
+
*/
|
|
2451
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2381
2452
|
private sendAudioInternal;
|
|
2382
2453
|
/**
|
|
2383
2454
|
* Only active when the client is in READY state; otherwise it returns immediately.
|
|
@@ -2690,7 +2761,42 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2690
2761
|
finalConfidence: z.ZodOptional<z.ZodNumber>;
|
|
2691
2762
|
voiceEnd: z.ZodOptional<z.ZodNumber>;
|
|
2692
2763
|
lastNonSilence: z.ZodOptional<z.ZodNumber>;
|
|
2764
|
+
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
2693
2765
|
asrConfig: z.ZodOptional<z.ZodString>;
|
|
2766
|
+
sessionConfigured: z.ZodOptional<z.ZodObject<{
|
|
2767
|
+
type: z.ZodLiteral<RecognitionResultTypeV1.SESSION_CONFIGURED>;
|
|
2768
|
+
audioUtteranceId: z.ZodString;
|
|
2769
|
+
provider: z.ZodOptional<z.ZodString>;
|
|
2770
|
+
model: z.ZodOptional<z.ZodString>;
|
|
2771
|
+
sampleRate: z.ZodOptional<z.ZodNumber>;
|
|
2772
|
+
encoding: z.ZodOptional<z.ZodString>;
|
|
2773
|
+
apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
|
|
2774
|
+
isFallback: z.ZodOptional<z.ZodBoolean>;
|
|
2775
|
+
asrRequest: z.ZodOptional<z.ZodString>;
|
|
2776
|
+
providerConfig: z.ZodOptional<z.ZodString>;
|
|
2777
|
+
}, "strip", z.ZodTypeAny, {
|
|
2778
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2779
|
+
audioUtteranceId: string;
|
|
2780
|
+
provider?: string | undefined;
|
|
2781
|
+
model?: string | undefined;
|
|
2782
|
+
sampleRate?: number | undefined;
|
|
2783
|
+
encoding?: string | undefined;
|
|
2784
|
+
apiType?: ASRApiType | undefined;
|
|
2785
|
+
isFallback?: boolean | undefined;
|
|
2786
|
+
asrRequest?: string | undefined;
|
|
2787
|
+
providerConfig?: string | undefined;
|
|
2788
|
+
}, {
|
|
2789
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2790
|
+
audioUtteranceId: string;
|
|
2791
|
+
provider?: string | undefined;
|
|
2792
|
+
model?: string | undefined;
|
|
2793
|
+
sampleRate?: number | undefined;
|
|
2794
|
+
encoding?: string | undefined;
|
|
2795
|
+
apiType?: ASRApiType | undefined;
|
|
2796
|
+
isFallback?: boolean | undefined;
|
|
2797
|
+
asrRequest?: string | undefined;
|
|
2798
|
+
providerConfig?: string | undefined;
|
|
2799
|
+
}>>;
|
|
2694
2800
|
startRecordingTimestamp: z.ZodOptional<z.ZodString>;
|
|
2695
2801
|
finalRecordingTimestamp: z.ZodOptional<z.ZodString>;
|
|
2696
2802
|
finalTranscriptionTimestamp: z.ZodOptional<z.ZodString>;
|
|
@@ -2700,6 +2806,28 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2700
2806
|
functionCallConfidence: z.ZodOptional<z.ZodNumber>;
|
|
2701
2807
|
finalFunctionCallTimestamp: z.ZodOptional<z.ZodString>;
|
|
2702
2808
|
promptSlotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
|
|
2809
|
+
promptSTT: z.ZodOptional<z.ZodString>;
|
|
2810
|
+
promptSTF: z.ZodOptional<z.ZodString>;
|
|
2811
|
+
promptTTF: z.ZodOptional<z.ZodString>;
|
|
2812
|
+
detections: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
2813
|
+
type: z.ZodNativeEnum<typeof DetectionTypeV1>;
|
|
2814
|
+
query: z.ZodString;
|
|
2815
|
+
score: z.ZodNumber;
|
|
2816
|
+
startMs: z.ZodOptional<z.ZodNumber>;
|
|
2817
|
+
endMs: z.ZodOptional<z.ZodNumber>;
|
|
2818
|
+
}, "strip", z.ZodTypeAny, {
|
|
2819
|
+
type: DetectionTypeV1;
|
|
2820
|
+
query: string;
|
|
2821
|
+
score: number;
|
|
2822
|
+
startMs?: number | undefined;
|
|
2823
|
+
endMs?: number | undefined;
|
|
2824
|
+
}, {
|
|
2825
|
+
type: DetectionTypeV1;
|
|
2826
|
+
query: string;
|
|
2827
|
+
score: number;
|
|
2828
|
+
startMs?: number | undefined;
|
|
2829
|
+
endMs?: number | undefined;
|
|
2830
|
+
}>, "many">>;
|
|
2703
2831
|
recognitionActionProcessingState: z.ZodOptional<z.ZodString>;
|
|
2704
2832
|
}, "strip", z.ZodTypeAny, {
|
|
2705
2833
|
audioUtteranceId: string;
|
|
@@ -2710,7 +2838,20 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2710
2838
|
finalConfidence?: number | undefined;
|
|
2711
2839
|
voiceEnd?: number | undefined;
|
|
2712
2840
|
lastNonSilence?: number | undefined;
|
|
2841
|
+
accumulatedAudioTimeMs?: number | undefined;
|
|
2713
2842
|
asrConfig?: string | undefined;
|
|
2843
|
+
sessionConfigured?: {
|
|
2844
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2845
|
+
audioUtteranceId: string;
|
|
2846
|
+
provider?: string | undefined;
|
|
2847
|
+
model?: string | undefined;
|
|
2848
|
+
sampleRate?: number | undefined;
|
|
2849
|
+
encoding?: string | undefined;
|
|
2850
|
+
apiType?: ASRApiType | undefined;
|
|
2851
|
+
isFallback?: boolean | undefined;
|
|
2852
|
+
asrRequest?: string | undefined;
|
|
2853
|
+
providerConfig?: string | undefined;
|
|
2854
|
+
} | undefined;
|
|
2714
2855
|
startRecordingTimestamp?: string | undefined;
|
|
2715
2856
|
finalRecordingTimestamp?: string | undefined;
|
|
2716
2857
|
finalTranscriptionTimestamp?: string | undefined;
|
|
@@ -2719,6 +2860,16 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2719
2860
|
functionCallConfidence?: number | undefined;
|
|
2720
2861
|
finalFunctionCallTimestamp?: string | undefined;
|
|
2721
2862
|
promptSlotMap?: Record<string, string[]> | undefined;
|
|
2863
|
+
promptSTT?: string | undefined;
|
|
2864
|
+
promptSTF?: string | undefined;
|
|
2865
|
+
promptTTF?: string | undefined;
|
|
2866
|
+
detections?: {
|
|
2867
|
+
type: DetectionTypeV1;
|
|
2868
|
+
query: string;
|
|
2869
|
+
score: number;
|
|
2870
|
+
startMs?: number | undefined;
|
|
2871
|
+
endMs?: number | undefined;
|
|
2872
|
+
}[] | undefined;
|
|
2722
2873
|
recognitionActionProcessingState?: string | undefined;
|
|
2723
2874
|
}, {
|
|
2724
2875
|
audioUtteranceId: string;
|
|
@@ -2728,7 +2879,20 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2728
2879
|
finalConfidence?: number | undefined;
|
|
2729
2880
|
voiceEnd?: number | undefined;
|
|
2730
2881
|
lastNonSilence?: number | undefined;
|
|
2882
|
+
accumulatedAudioTimeMs?: number | undefined;
|
|
2731
2883
|
asrConfig?: string | undefined;
|
|
2884
|
+
sessionConfigured?: {
|
|
2885
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2886
|
+
audioUtteranceId: string;
|
|
2887
|
+
provider?: string | undefined;
|
|
2888
|
+
model?: string | undefined;
|
|
2889
|
+
sampleRate?: number | undefined;
|
|
2890
|
+
encoding?: string | undefined;
|
|
2891
|
+
apiType?: ASRApiType | undefined;
|
|
2892
|
+
isFallback?: boolean | undefined;
|
|
2893
|
+
asrRequest?: string | undefined;
|
|
2894
|
+
providerConfig?: string | undefined;
|
|
2895
|
+
} | undefined;
|
|
2732
2896
|
startRecordingTimestamp?: string | undefined;
|
|
2733
2897
|
finalRecordingTimestamp?: string | undefined;
|
|
2734
2898
|
finalTranscriptionTimestamp?: string | undefined;
|
|
@@ -2738,6 +2902,16 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2738
2902
|
functionCallConfidence?: number | undefined;
|
|
2739
2903
|
finalFunctionCallTimestamp?: string | undefined;
|
|
2740
2904
|
promptSlotMap?: Record<string, string[]> | undefined;
|
|
2905
|
+
promptSTT?: string | undefined;
|
|
2906
|
+
promptSTF?: string | undefined;
|
|
2907
|
+
promptTTF?: string | undefined;
|
|
2908
|
+
detections?: {
|
|
2909
|
+
type: DetectionTypeV1;
|
|
2910
|
+
query: string;
|
|
2911
|
+
score: number;
|
|
2912
|
+
startMs?: number | undefined;
|
|
2913
|
+
endMs?: number | undefined;
|
|
2914
|
+
}[] | undefined;
|
|
2741
2915
|
recognitionActionProcessingState?: string | undefined;
|
|
2742
2916
|
}>;
|
|
2743
2917
|
type RecognitionState = z.infer<typeof RecognitionVGFStateSchema>;
|
|
@@ -2787,94 +2961,22 @@ interface SimplifiedVGFClientConfig extends IRecognitionClientConfig {
|
|
|
2787
2961
|
/**
|
|
2788
2962
|
* Interface for SimplifiedVGFRecognitionClient
|
|
2789
2963
|
*
|
|
2790
|
-
*
|
|
2791
|
-
*
|
|
2964
|
+
* Inherits the full IRecognitionClient surface (connect, sendAudio,
|
|
2965
|
+
* sendAudioWithSampleRate, sendPrefixAudio, stopRecording, stopAbnormally,
|
|
2966
|
+
* status checks, sendGameContext, getStats, getUrl, getState, getAudioUtteranceId)
|
|
2967
|
+
* — see recognition-client.types.ts for those. Adds VGF-specific state access.
|
|
2968
|
+
*
|
|
2969
|
+
* Extending IRecognitionClient (rather than redeclaring methods) means
|
|
2970
|
+
* TypeScript catches any base-client method that's not delegated by the
|
|
2971
|
+
* VGF wrapper at compile time — keeps the two surfaces in sync.
|
|
2792
2972
|
*/
|
|
2793
|
-
interface ISimplifiedVGFRecognitionClient {
|
|
2794
|
-
/**
|
|
2795
|
-
* Connect to the recognition service WebSocket
|
|
2796
|
-
* @returns Promise that resolves when connected and ready
|
|
2797
|
-
*/
|
|
2798
|
-
connect(): Promise<void>;
|
|
2799
|
-
/**
|
|
2800
|
-
* Send audio data for transcription
|
|
2801
|
-
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
2802
|
-
*/
|
|
2803
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2804
|
-
/**
|
|
2805
|
-
* Stop recording and wait for final transcription
|
|
2806
|
-
* @returns Promise that resolves when transcription is complete
|
|
2807
|
-
*/
|
|
2808
|
-
stopRecording(): Promise<void>;
|
|
2809
|
-
/**
|
|
2810
|
-
* Force stop and immediately close connection without waiting for server
|
|
2811
|
-
*
|
|
2812
|
-
* WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
|
|
2813
|
-
* - Does NOT wait for server to process remaining audio
|
|
2814
|
-
* - Does NOT receive final transcript from server (VGF state set to empty)
|
|
2815
|
-
* - Immediately closes WebSocket connection
|
|
2816
|
-
* - Cleans up resources (buffers, listeners)
|
|
2817
|
-
*
|
|
2818
|
-
* Use Cases:
|
|
2819
|
-
* - User explicitly cancels/abandons the session
|
|
2820
|
-
* - Timeout scenarios where waiting is not acceptable
|
|
2821
|
-
* - Need immediate cleanup and can't wait for server
|
|
2822
|
-
*
|
|
2823
|
-
* RECOMMENDED: Use stopRecording() for normal shutdown.
|
|
2824
|
-
* Only use this when immediate disconnection is required.
|
|
2825
|
-
*/
|
|
2826
|
-
stopAbnormally(): void;
|
|
2973
|
+
interface ISimplifiedVGFRecognitionClient extends IRecognitionClient {
|
|
2827
2974
|
/**
|
|
2828
|
-
* Get the current VGF recognition state
|
|
2975
|
+
* Get the current VGF recognition state — the single shared store
|
|
2976
|
+
* of inputs and outputs for this utterance.
|
|
2829
2977
|
* @returns Current RecognitionState with all transcription data
|
|
2830
2978
|
*/
|
|
2831
2979
|
getVGFState(): RecognitionState;
|
|
2832
|
-
/**
|
|
2833
|
-
* Check if connected to the WebSocket
|
|
2834
|
-
*/
|
|
2835
|
-
isConnected(): boolean;
|
|
2836
|
-
/**
|
|
2837
|
-
* Check if currently connecting
|
|
2838
|
-
*/
|
|
2839
|
-
isConnecting(): boolean;
|
|
2840
|
-
/**
|
|
2841
|
-
* Check if currently stopping
|
|
2842
|
-
*/
|
|
2843
|
-
isStopping(): boolean;
|
|
2844
|
-
/**
|
|
2845
|
-
* Check if transcription has finished
|
|
2846
|
-
*/
|
|
2847
|
-
isTranscriptionFinished(): boolean;
|
|
2848
|
-
/**
|
|
2849
|
-
* Check if the audio buffer has overflowed
|
|
2850
|
-
*/
|
|
2851
|
-
isBufferOverflowing(): boolean;
|
|
2852
|
-
/**
|
|
2853
|
-
* Send game context after connection is established (for preconnect flow).
|
|
2854
|
-
*
|
|
2855
|
-
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
2856
|
-
* WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
|
|
2857
|
-
*
|
|
2858
|
-
* @param context - Game context including slotMap for keyword boosting
|
|
2859
|
-
*/
|
|
2860
|
-
sendGameContext(context: GameContextV1): void;
|
|
2861
|
-
/**
|
|
2862
|
-
* Check if server has sent READY signal (provider connected, ready for audio).
|
|
2863
|
-
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
2864
|
-
*/
|
|
2865
|
-
isServerReady(): boolean;
|
|
2866
|
-
/**
|
|
2867
|
-
* Get the audio utterance ID for this session
|
|
2868
|
-
*/
|
|
2869
|
-
getAudioUtteranceId(): string;
|
|
2870
|
-
/**
|
|
2871
|
-
* Get the WebSocket URL being used
|
|
2872
|
-
*/
|
|
2873
|
-
getUrl(): string;
|
|
2874
|
-
/**
|
|
2875
|
-
* Get the underlying client state (for advanced usage)
|
|
2876
|
-
*/
|
|
2877
|
-
getState(): ClientState;
|
|
2878
2980
|
}
|
|
2879
2981
|
/**
|
|
2880
2982
|
* This wrapper ONLY maintains VGF state as a sink.
|
|
@@ -2891,6 +2993,15 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
|
|
|
2891
2993
|
constructor(config: SimplifiedVGFClientConfig);
|
|
2892
2994
|
connect(): Promise<void>;
|
|
2893
2995
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2996
|
+
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2997
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2998
|
+
getStats(): IRecognitionClientStats;
|
|
2999
|
+
/**
|
|
3000
|
+
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
3001
|
+
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
3002
|
+
* `isRecordingAudio`.
|
|
3003
|
+
*/
|
|
3004
|
+
private markRecordingStarted;
|
|
2894
3005
|
stopRecording(): Promise<void>;
|
|
2895
3006
|
stopAbnormally(): void;
|
|
2896
3007
|
getAudioUtteranceId(): string;
|