@volley/recognition-client-sdk 0.1.799 → 0.1.800
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +15 -0
- package/dist/index.bundled.d.ts +131 -91
- package/dist/index.js +76 -7
- package/dist/index.js.map +2 -2
- package/dist/recognition-client.types.d.ts +15 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +14 -92
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/vgf-recognition-mapper.d.ts +9 -17
- package/dist/vgf-recognition-mapper.d.ts.map +1 -1
- package/dist/vgf-recognition-state.d.ts +103 -0
- package/dist/vgf-recognition-state.d.ts.map +1 -1
- package/package.json +3 -3
- package/src/recognition-client.types.ts +16 -0
- package/src/simplified-vgf-recognition-client.spec.ts +0 -27
- package/src/simplified-vgf-recognition-client.ts +65 -122
- package/src/vgf-recognition-mapper.spec.ts +143 -0
- package/src/vgf-recognition-mapper.ts +35 -45
- package/src/vgf-recognition-state.ts +19 -1
package/dist/browser.bundled.d.ts
CHANGED
|
@@ -1406,6 +1406,11 @@ interface IRecognitionClientConfig {
|
|
|
1406
1406
|
*
|
|
1407
1407
|
* Main interface for real-time speech recognition clients.
|
|
1408
1408
|
* Provides methods for connection management, audio streaming, and session control.
|
|
1409
|
+
*
|
|
1410
|
+
* NOTE for maintainers: `ISimplifiedVGFRecognitionClient` extends this interface,
|
|
1411
|
+
* so any method added here must also be implemented (typically as a delegate) by
|
|
1412
|
+
* `SimplifiedVGFRecognitionClient`. TypeScript will flag missing delegates at
|
|
1413
|
+
* compile time — do not work around the error, add the delegate.
|
|
1409
1414
|
*/
|
|
1410
1415
|
interface IRecognitionClient {
|
|
1411
1416
|
/**
|
|
@@ -1437,6 +1442,16 @@ interface IRecognitionClient {
|
|
|
1437
1442
|
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
1438
1443
|
*/
|
|
1439
1444
|
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
1445
|
+
/**
|
|
1446
|
+
* Send prefix audio (e.g. a TTS prompt) that primes the provider's language
|
|
1447
|
+
* model before user audio is streamed. Chunks accepted — the server buffers
|
|
1448
|
+
* until the session is READY and flushes. Must be sent BEFORE the first
|
|
1449
|
+
* `sendAudio()` to take effect. Only meaningful when
|
|
1450
|
+
* `asrRequestConfig.prefixMode === PrefixMode.CLIENT`.
|
|
1451
|
+
*
|
|
1452
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
1453
|
+
*/
|
|
1454
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
1440
1455
|
/**
|
|
1441
1456
|
* Stop recording and wait for final transcript
|
|
1442
1457
|
* The server will close the connection after sending the final transcript.
|
package/dist/index.bundled.d.ts
CHANGED
|
@@ -2175,6 +2175,11 @@ interface IRecognitionClientConfig {
|
|
|
2175
2175
|
*
|
|
2176
2176
|
* Main interface for real-time speech recognition clients.
|
|
2177
2177
|
* Provides methods for connection management, audio streaming, and session control.
|
|
2178
|
+
*
|
|
2179
|
+
* NOTE for maintainers: `ISimplifiedVGFRecognitionClient` extends this interface,
|
|
2180
|
+
* so any method added here must also be implemented (typically as a delegate) by
|
|
2181
|
+
* `SimplifiedVGFRecognitionClient`. TypeScript will flag missing delegates at
|
|
2182
|
+
* compile time — do not work around the error, add the delegate.
|
|
2178
2183
|
*/
|
|
2179
2184
|
interface IRecognitionClient {
|
|
2180
2185
|
/**
|
|
@@ -2206,6 +2211,16 @@ interface IRecognitionClient {
|
|
|
2206
2211
|
* @param sourceSampleRate - Source sample rate in Hz (e.g. 44100, 48000).
|
|
2207
2212
|
*/
|
|
2208
2213
|
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2214
|
+
/**
|
|
2215
|
+
* Send prefix audio (e.g. a TTS prompt) that primes the provider's language
|
|
2216
|
+
* model before user audio is streamed. Chunks accepted — the server buffers
|
|
2217
|
+
* until the session is READY and flushes. Must be sent BEFORE the first
|
|
2218
|
+
* `sendAudio()` to take effect. Only meaningful when
|
|
2219
|
+
* `asrRequestConfig.prefixMode === PrefixMode.CLIENT`.
|
|
2220
|
+
*
|
|
2221
|
+
* @param audioData - PCM audio data as ArrayBuffer, typed array view, or Blob
|
|
2222
|
+
*/
|
|
2223
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2209
2224
|
/**
|
|
2210
2225
|
* Stop recording and wait for final transcript
|
|
2211
2226
|
* The server will close the connection after sending the final transcript.
|
|
@@ -2746,7 +2761,42 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2746
2761
|
finalConfidence: z.ZodOptional<z.ZodNumber>;
|
|
2747
2762
|
voiceEnd: z.ZodOptional<z.ZodNumber>;
|
|
2748
2763
|
lastNonSilence: z.ZodOptional<z.ZodNumber>;
|
|
2764
|
+
accumulatedAudioTimeMs: z.ZodOptional<z.ZodNumber>;
|
|
2749
2765
|
asrConfig: z.ZodOptional<z.ZodString>;
|
|
2766
|
+
sessionConfigured: z.ZodOptional<z.ZodObject<{
|
|
2767
|
+
type: z.ZodLiteral<RecognitionResultTypeV1.SESSION_CONFIGURED>;
|
|
2768
|
+
audioUtteranceId: z.ZodString;
|
|
2769
|
+
provider: z.ZodOptional<z.ZodString>;
|
|
2770
|
+
model: z.ZodOptional<z.ZodString>;
|
|
2771
|
+
sampleRate: z.ZodOptional<z.ZodNumber>;
|
|
2772
|
+
encoding: z.ZodOptional<z.ZodString>;
|
|
2773
|
+
apiType: z.ZodOptional<z.ZodNativeEnum<typeof ASRApiType>>;
|
|
2774
|
+
isFallback: z.ZodOptional<z.ZodBoolean>;
|
|
2775
|
+
asrRequest: z.ZodOptional<z.ZodString>;
|
|
2776
|
+
providerConfig: z.ZodOptional<z.ZodString>;
|
|
2777
|
+
}, "strip", z.ZodTypeAny, {
|
|
2778
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2779
|
+
audioUtteranceId: string;
|
|
2780
|
+
provider?: string | undefined;
|
|
2781
|
+
model?: string | undefined;
|
|
2782
|
+
sampleRate?: number | undefined;
|
|
2783
|
+
encoding?: string | undefined;
|
|
2784
|
+
apiType?: ASRApiType | undefined;
|
|
2785
|
+
isFallback?: boolean | undefined;
|
|
2786
|
+
asrRequest?: string | undefined;
|
|
2787
|
+
providerConfig?: string | undefined;
|
|
2788
|
+
}, {
|
|
2789
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2790
|
+
audioUtteranceId: string;
|
|
2791
|
+
provider?: string | undefined;
|
|
2792
|
+
model?: string | undefined;
|
|
2793
|
+
sampleRate?: number | undefined;
|
|
2794
|
+
encoding?: string | undefined;
|
|
2795
|
+
apiType?: ASRApiType | undefined;
|
|
2796
|
+
isFallback?: boolean | undefined;
|
|
2797
|
+
asrRequest?: string | undefined;
|
|
2798
|
+
providerConfig?: string | undefined;
|
|
2799
|
+
}>>;
|
|
2750
2800
|
startRecordingTimestamp: z.ZodOptional<z.ZodString>;
|
|
2751
2801
|
finalRecordingTimestamp: z.ZodOptional<z.ZodString>;
|
|
2752
2802
|
finalTranscriptionTimestamp: z.ZodOptional<z.ZodString>;
|
|
@@ -2756,6 +2806,28 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2756
2806
|
functionCallConfidence: z.ZodOptional<z.ZodNumber>;
|
|
2757
2807
|
finalFunctionCallTimestamp: z.ZodOptional<z.ZodString>;
|
|
2758
2808
|
promptSlotMap: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>>;
|
|
2809
|
+
promptSTT: z.ZodOptional<z.ZodString>;
|
|
2810
|
+
promptSTF: z.ZodOptional<z.ZodString>;
|
|
2811
|
+
promptTTF: z.ZodOptional<z.ZodString>;
|
|
2812
|
+
detections: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
2813
|
+
type: z.ZodNativeEnum<typeof DetectionTypeV1>;
|
|
2814
|
+
query: z.ZodString;
|
|
2815
|
+
score: z.ZodNumber;
|
|
2816
|
+
startMs: z.ZodOptional<z.ZodNumber>;
|
|
2817
|
+
endMs: z.ZodOptional<z.ZodNumber>;
|
|
2818
|
+
}, "strip", z.ZodTypeAny, {
|
|
2819
|
+
type: DetectionTypeV1;
|
|
2820
|
+
query: string;
|
|
2821
|
+
score: number;
|
|
2822
|
+
startMs?: number | undefined;
|
|
2823
|
+
endMs?: number | undefined;
|
|
2824
|
+
}, {
|
|
2825
|
+
type: DetectionTypeV1;
|
|
2826
|
+
query: string;
|
|
2827
|
+
score: number;
|
|
2828
|
+
startMs?: number | undefined;
|
|
2829
|
+
endMs?: number | undefined;
|
|
2830
|
+
}>, "many">>;
|
|
2759
2831
|
recognitionActionProcessingState: z.ZodOptional<z.ZodString>;
|
|
2760
2832
|
}, "strip", z.ZodTypeAny, {
|
|
2761
2833
|
audioUtteranceId: string;
|
|
@@ -2766,7 +2838,20 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2766
2838
|
finalConfidence?: number | undefined;
|
|
2767
2839
|
voiceEnd?: number | undefined;
|
|
2768
2840
|
lastNonSilence?: number | undefined;
|
|
2841
|
+
accumulatedAudioTimeMs?: number | undefined;
|
|
2769
2842
|
asrConfig?: string | undefined;
|
|
2843
|
+
sessionConfigured?: {
|
|
2844
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2845
|
+
audioUtteranceId: string;
|
|
2846
|
+
provider?: string | undefined;
|
|
2847
|
+
model?: string | undefined;
|
|
2848
|
+
sampleRate?: number | undefined;
|
|
2849
|
+
encoding?: string | undefined;
|
|
2850
|
+
apiType?: ASRApiType | undefined;
|
|
2851
|
+
isFallback?: boolean | undefined;
|
|
2852
|
+
asrRequest?: string | undefined;
|
|
2853
|
+
providerConfig?: string | undefined;
|
|
2854
|
+
} | undefined;
|
|
2770
2855
|
startRecordingTimestamp?: string | undefined;
|
|
2771
2856
|
finalRecordingTimestamp?: string | undefined;
|
|
2772
2857
|
finalTranscriptionTimestamp?: string | undefined;
|
|
@@ -2775,6 +2860,16 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2775
2860
|
functionCallConfidence?: number | undefined;
|
|
2776
2861
|
finalFunctionCallTimestamp?: string | undefined;
|
|
2777
2862
|
promptSlotMap?: Record<string, string[]> | undefined;
|
|
2863
|
+
promptSTT?: string | undefined;
|
|
2864
|
+
promptSTF?: string | undefined;
|
|
2865
|
+
promptTTF?: string | undefined;
|
|
2866
|
+
detections?: {
|
|
2867
|
+
type: DetectionTypeV1;
|
|
2868
|
+
query: string;
|
|
2869
|
+
score: number;
|
|
2870
|
+
startMs?: number | undefined;
|
|
2871
|
+
endMs?: number | undefined;
|
|
2872
|
+
}[] | undefined;
|
|
2778
2873
|
recognitionActionProcessingState?: string | undefined;
|
|
2779
2874
|
}, {
|
|
2780
2875
|
audioUtteranceId: string;
|
|
@@ -2784,7 +2879,20 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2784
2879
|
finalConfidence?: number | undefined;
|
|
2785
2880
|
voiceEnd?: number | undefined;
|
|
2786
2881
|
lastNonSilence?: number | undefined;
|
|
2882
|
+
accumulatedAudioTimeMs?: number | undefined;
|
|
2787
2883
|
asrConfig?: string | undefined;
|
|
2884
|
+
sessionConfigured?: {
|
|
2885
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED;
|
|
2886
|
+
audioUtteranceId: string;
|
|
2887
|
+
provider?: string | undefined;
|
|
2888
|
+
model?: string | undefined;
|
|
2889
|
+
sampleRate?: number | undefined;
|
|
2890
|
+
encoding?: string | undefined;
|
|
2891
|
+
apiType?: ASRApiType | undefined;
|
|
2892
|
+
isFallback?: boolean | undefined;
|
|
2893
|
+
asrRequest?: string | undefined;
|
|
2894
|
+
providerConfig?: string | undefined;
|
|
2895
|
+
} | undefined;
|
|
2788
2896
|
startRecordingTimestamp?: string | undefined;
|
|
2789
2897
|
finalRecordingTimestamp?: string | undefined;
|
|
2790
2898
|
finalTranscriptionTimestamp?: string | undefined;
|
|
@@ -2794,6 +2902,16 @@ declare const RecognitionVGFStateSchema: z.ZodObject<{
|
|
|
2794
2902
|
functionCallConfidence?: number | undefined;
|
|
2795
2903
|
finalFunctionCallTimestamp?: string | undefined;
|
|
2796
2904
|
promptSlotMap?: Record<string, string[]> | undefined;
|
|
2905
|
+
promptSTT?: string | undefined;
|
|
2906
|
+
promptSTF?: string | undefined;
|
|
2907
|
+
promptTTF?: string | undefined;
|
|
2908
|
+
detections?: {
|
|
2909
|
+
type: DetectionTypeV1;
|
|
2910
|
+
query: string;
|
|
2911
|
+
score: number;
|
|
2912
|
+
startMs?: number | undefined;
|
|
2913
|
+
endMs?: number | undefined;
|
|
2914
|
+
}[] | undefined;
|
|
2797
2915
|
recognitionActionProcessingState?: string | undefined;
|
|
2798
2916
|
}>;
|
|
2799
2917
|
type RecognitionState = z.infer<typeof RecognitionVGFStateSchema>;
|
|
@@ -2843,102 +2961,22 @@ interface SimplifiedVGFClientConfig extends IRecognitionClientConfig {
|
|
|
2843
2961
|
/**
|
|
2844
2962
|
* Interface for SimplifiedVGFRecognitionClient
|
|
2845
2963
|
*
|
|
2846
|
-
*
|
|
2847
|
-
*
|
|
2964
|
+
* Inherits the full IRecognitionClient surface (connect, sendAudio,
|
|
2965
|
+
* sendAudioWithSampleRate, sendPrefixAudio, stopRecording, stopAbnormally,
|
|
2966
|
+
* status checks, sendGameContext, getStats, getUrl, getState, getAudioUtteranceId)
|
|
2967
|
+
* — see recognition-client.types.ts for those. Adds VGF-specific state access.
|
|
2968
|
+
*
|
|
2969
|
+
* Extending IRecognitionClient (rather than redeclaring methods) means
|
|
2970
|
+
* TypeScript catches any base-client method that's not delegated by the
|
|
2971
|
+
* VGF wrapper at compile time — keeps the two surfaces in sync.
|
|
2848
2972
|
*/
|
|
2849
|
-
interface ISimplifiedVGFRecognitionClient {
|
|
2973
|
+
interface ISimplifiedVGFRecognitionClient extends IRecognitionClient {
|
|
2850
2974
|
/**
|
|
2851
|
-
*
|
|
2852
|
-
*
|
|
2853
|
-
*/
|
|
2854
|
-
connect(): Promise<void>;
|
|
2855
|
-
/**
|
|
2856
|
-
* Send audio data for transcription
|
|
2857
|
-
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
2858
|
-
*/
|
|
2859
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2860
|
-
/**
|
|
2861
|
-
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
2862
|
-
* downsamples to the session's target rate before transmitting. Use
|
|
2863
|
-
* when capture is at the system's native rate (browser AudioContext is
|
|
2864
|
-
* typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
|
|
2865
|
-
* little-endian PCM, mono.
|
|
2866
|
-
*/
|
|
2867
|
-
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2868
|
-
/**
|
|
2869
|
-
* Stop recording and wait for final transcription
|
|
2870
|
-
* @returns Promise that resolves when transcription is complete
|
|
2871
|
-
*/
|
|
2872
|
-
stopRecording(): Promise<void>;
|
|
2873
|
-
/**
|
|
2874
|
-
* Force stop and immediately close connection without waiting for server
|
|
2875
|
-
*
|
|
2876
|
-
* WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
|
|
2877
|
-
* - Does NOT wait for server to process remaining audio
|
|
2878
|
-
* - Does NOT receive final transcript from server (VGF state set to empty)
|
|
2879
|
-
* - Immediately closes WebSocket connection
|
|
2880
|
-
* - Cleans up resources (buffers, listeners)
|
|
2881
|
-
*
|
|
2882
|
-
* Use Cases:
|
|
2883
|
-
* - User explicitly cancels/abandons the session
|
|
2884
|
-
* - Timeout scenarios where waiting is not acceptable
|
|
2885
|
-
* - Need immediate cleanup and can't wait for server
|
|
2886
|
-
*
|
|
2887
|
-
* RECOMMENDED: Use stopRecording() for normal shutdown.
|
|
2888
|
-
* Only use this when immediate disconnection is required.
|
|
2889
|
-
*/
|
|
2890
|
-
stopAbnormally(): void;
|
|
2891
|
-
/**
|
|
2892
|
-
* Get the current VGF recognition state
|
|
2975
|
+
* Get the current VGF recognition state — the single shared store
|
|
2976
|
+
* of inputs and outputs for this utterance.
|
|
2893
2977
|
* @returns Current RecognitionState with all transcription data
|
|
2894
2978
|
*/
|
|
2895
2979
|
getVGFState(): RecognitionState;
|
|
2896
|
-
/**
|
|
2897
|
-
* Check if connected to the WebSocket
|
|
2898
|
-
*/
|
|
2899
|
-
isConnected(): boolean;
|
|
2900
|
-
/**
|
|
2901
|
-
* Check if currently connecting
|
|
2902
|
-
*/
|
|
2903
|
-
isConnecting(): boolean;
|
|
2904
|
-
/**
|
|
2905
|
-
* Check if currently stopping
|
|
2906
|
-
*/
|
|
2907
|
-
isStopping(): boolean;
|
|
2908
|
-
/**
|
|
2909
|
-
* Check if transcription has finished
|
|
2910
|
-
*/
|
|
2911
|
-
isTranscriptionFinished(): boolean;
|
|
2912
|
-
/**
|
|
2913
|
-
* Check if the audio buffer has overflowed
|
|
2914
|
-
*/
|
|
2915
|
-
isBufferOverflowing(): boolean;
|
|
2916
|
-
/**
|
|
2917
|
-
* Send game context after connection is established (for preconnect flow).
|
|
2918
|
-
*
|
|
2919
|
-
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
2920
|
-
* WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
|
|
2921
|
-
*
|
|
2922
|
-
* @param context - Game context including slotMap for keyword boosting
|
|
2923
|
-
*/
|
|
2924
|
-
sendGameContext(context: GameContextV1): void;
|
|
2925
|
-
/**
|
|
2926
|
-
* Check if server has sent READY signal (provider connected, ready for audio).
|
|
2927
|
-
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
2928
|
-
*/
|
|
2929
|
-
isServerReady(): boolean;
|
|
2930
|
-
/**
|
|
2931
|
-
* Get the audio utterance ID for this session
|
|
2932
|
-
*/
|
|
2933
|
-
getAudioUtteranceId(): string;
|
|
2934
|
-
/**
|
|
2935
|
-
* Get the WebSocket URL being used
|
|
2936
|
-
*/
|
|
2937
|
-
getUrl(): string;
|
|
2938
|
-
/**
|
|
2939
|
-
* Get the underlying client state (for advanced usage)
|
|
2940
|
-
*/
|
|
2941
|
-
getState(): ClientState;
|
|
2942
2980
|
}
|
|
2943
2981
|
/**
|
|
2944
2982
|
* This wrapper ONLY maintains VGF state as a sink.
|
|
@@ -2956,6 +2994,8 @@ declare class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognitio
|
|
|
2956
2994
|
connect(): Promise<void>;
|
|
2957
2995
|
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2958
2996
|
sendAudioWithSampleRate(audioData: ArrayBuffer | ArrayBufferView | Blob, sourceSampleRate: number): void;
|
|
2997
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
2998
|
+
getStats(): IRecognitionClientStats;
|
|
2959
2999
|
/**
|
|
2960
3000
|
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
2961
3001
|
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
package/dist/index.js
CHANGED
|
@@ -6435,9 +6435,15 @@ var RecognitionVGFStateSchema = z.object({
|
|
|
6435
6435
|
// voice end time identified by ASR
|
|
6436
6436
|
lastNonSilence: z.number().optional(),
|
|
6437
6437
|
// last non-silence sample time from PCM analysis
|
|
6438
|
+
accumulatedAudioTimeMs: z.number().optional(),
|
|
6439
|
+
// total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
|
|
6438
6440
|
// Tracking-only metadata
|
|
6439
6441
|
asrConfig: z.string().optional(),
|
|
6440
|
-
// Json format of the ASR config
|
|
6442
|
+
// Json format of the *requested* ASR config (set once at construction).
|
|
6443
|
+
// For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
|
|
6444
|
+
// server after circuit-breaker/fallback — see `sessionConfigured` below.
|
|
6445
|
+
sessionConfigured: SessionConfiguredSchemaV1.optional(),
|
|
6446
|
+
// Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
|
|
6441
6447
|
startRecordingTimestamp: z.string().optional(),
|
|
6442
6448
|
// Start of recording. Immutable after set.
|
|
6443
6449
|
finalRecordingTimestamp: z.string().optional(),
|
|
@@ -6458,6 +6464,17 @@ var RecognitionVGFStateSchema = z.object({
|
|
|
6458
6464
|
// Support for prompt slot mapping - passed to recognition context when present
|
|
6459
6465
|
promptSlotMap: z.record(z.string(), z.array(z.string())).optional(),
|
|
6460
6466
|
// Optional map of slot names to prompt values for recognition context
|
|
6467
|
+
// Optional prompt inputs - when set, forwarded into GameContext at client creation.
|
|
6468
|
+
// Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
|
|
6469
|
+
promptSTT: z.string().optional(),
|
|
6470
|
+
promptSTF: z.string().optional(),
|
|
6471
|
+
promptTTF: z.string().optional(),
|
|
6472
|
+
// Provider-reported phrase detections from the last transcript message.
|
|
6473
|
+
// Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
|
|
6474
|
+
// (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
|
|
6475
|
+
// Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
|
|
6476
|
+
// and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
|
|
6477
|
+
detections: z.array(DetectionV1Schema).optional(),
|
|
6461
6478
|
// Recognition action processing state - managed externally, SDK preserves but never modifies
|
|
6462
6479
|
recognitionActionProcessingState: z.string().optional()
|
|
6463
6480
|
// "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
|
|
@@ -6529,6 +6546,9 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
|
|
|
6529
6546
|
if (result.lastNonSilence !== void 0) {
|
|
6530
6547
|
newState.lastNonSilence = result.lastNonSilence;
|
|
6531
6548
|
}
|
|
6549
|
+
if (result.accumulatedAudioTimeMs !== void 0) {
|
|
6550
|
+
newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
|
|
6551
|
+
}
|
|
6532
6552
|
} else {
|
|
6533
6553
|
newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
|
|
6534
6554
|
newState.finalTranscript = result.finalTranscript || "";
|
|
@@ -6542,12 +6562,24 @@ function mapTranscriptionResultToState(currentState, result, isRecording) {
|
|
|
6542
6562
|
if (result.lastNonSilence !== void 0) {
|
|
6543
6563
|
newState.lastNonSilence = result.lastNonSilence;
|
|
6544
6564
|
}
|
|
6565
|
+
if (result.accumulatedAudioTimeMs !== void 0) {
|
|
6566
|
+
newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
|
|
6567
|
+
}
|
|
6545
6568
|
newState.pendingTranscript = "";
|
|
6546
6569
|
newState.pendingConfidence = void 0;
|
|
6547
6570
|
}
|
|
6571
|
+
if (result.detections !== void 0) {
|
|
6572
|
+
newState.detections = result.detections;
|
|
6573
|
+
}
|
|
6548
6574
|
return newState;
|
|
6549
6575
|
}
|
|
6550
|
-
function
|
|
6576
|
+
function mapSessionConfiguredToState(currentState, sessionConfigured) {
|
|
6577
|
+
return {
|
|
6578
|
+
...currentState,
|
|
6579
|
+
sessionConfigured
|
|
6580
|
+
};
|
|
6581
|
+
}
|
|
6582
|
+
function mapErrorToState(currentState) {
|
|
6551
6583
|
return {
|
|
6552
6584
|
...currentState,
|
|
6553
6585
|
transcriptionStatus: TranscriptionStatus.ERROR,
|
|
@@ -6579,7 +6611,10 @@ function resetRecognitionVGFState(currentState) {
|
|
|
6579
6611
|
recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
|
|
6580
6612
|
finalTranscript: void 0,
|
|
6581
6613
|
voiceEnd: void 0,
|
|
6582
|
-
lastNonSilence: void 0
|
|
6614
|
+
lastNonSilence: void 0,
|
|
6615
|
+
accumulatedAudioTimeMs: void 0,
|
|
6616
|
+
detections: void 0,
|
|
6617
|
+
sessionConfigured: void 0
|
|
6583
6618
|
};
|
|
6584
6619
|
}
|
|
6585
6620
|
function generateUUID() {
|
|
@@ -6624,16 +6659,28 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6624
6659
|
}
|
|
6625
6660
|
this.state = { ...this.state, startRecordingStatus: "READY" };
|
|
6626
6661
|
this.expectedUuid = this.state.audioUtteranceId;
|
|
6627
|
-
|
|
6662
|
+
const hasPromptInputs = this.state.promptSlotMap !== void 0 || this.state.promptSTT !== void 0 || this.state.promptSTF !== void 0 || this.state.promptTTF !== void 0;
|
|
6663
|
+
if (hasPromptInputs) {
|
|
6628
6664
|
if (clientConfig.asrRequestConfig) {
|
|
6629
6665
|
clientConfig.asrRequestConfig.useContext = true;
|
|
6630
6666
|
}
|
|
6631
6667
|
if (!clientConfig.gameContext) {
|
|
6632
6668
|
if (clientConfig.logger) {
|
|
6633
|
-
clientConfig.logger("warn", "[VGF]
|
|
6669
|
+
clientConfig.logger("warn", "[VGF] prompt inputs found but no gameContext provided. They will not be sent.");
|
|
6634
6670
|
}
|
|
6635
6671
|
} else {
|
|
6636
|
-
|
|
6672
|
+
if (this.state.promptSlotMap !== void 0) {
|
|
6673
|
+
clientConfig.gameContext.slotMap = this.state.promptSlotMap;
|
|
6674
|
+
}
|
|
6675
|
+
if (this.state.promptSTT !== void 0) {
|
|
6676
|
+
clientConfig.gameContext.promptSTT = this.state.promptSTT;
|
|
6677
|
+
}
|
|
6678
|
+
if (this.state.promptSTF !== void 0) {
|
|
6679
|
+
clientConfig.gameContext.promptSTF = this.state.promptSTF;
|
|
6680
|
+
}
|
|
6681
|
+
if (this.state.promptTTF !== void 0) {
|
|
6682
|
+
clientConfig.gameContext.promptTTF = this.state.promptTTF;
|
|
6683
|
+
}
|
|
6637
6684
|
}
|
|
6638
6685
|
}
|
|
6639
6686
|
this.client = new RealTimeTwoWayWebSocketRecognitionClient({
|
|
@@ -6669,6 +6716,22 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6669
6716
|
clientConfig.onMetadata(metadata);
|
|
6670
6717
|
}
|
|
6671
6718
|
},
|
|
6719
|
+
onSessionConfigured: (sessionConfigured) => {
|
|
6720
|
+
if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
|
|
6721
|
+
if (this.logger) {
|
|
6722
|
+
this.logger(
|
|
6723
|
+
"warn",
|
|
6724
|
+
`[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
|
|
6725
|
+
);
|
|
6726
|
+
}
|
|
6727
|
+
return;
|
|
6728
|
+
}
|
|
6729
|
+
this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
|
|
6730
|
+
this.notifyStateChange();
|
|
6731
|
+
if (clientConfig.onSessionConfigured) {
|
|
6732
|
+
clientConfig.onSessionConfigured(sessionConfigured);
|
|
6733
|
+
}
|
|
6734
|
+
},
|
|
6672
6735
|
onFunctionCall: (result) => {
|
|
6673
6736
|
if (clientConfig.onFunctionCall) {
|
|
6674
6737
|
clientConfig.onFunctionCall(result);
|
|
@@ -6685,7 +6748,7 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6685
6748
|
return;
|
|
6686
6749
|
}
|
|
6687
6750
|
this.isRecordingAudio = false;
|
|
6688
|
-
this.state = mapErrorToState(this.state
|
|
6751
|
+
this.state = mapErrorToState(this.state);
|
|
6689
6752
|
this.notifyStateChange();
|
|
6690
6753
|
if (clientConfig.onError) {
|
|
6691
6754
|
clientConfig.onError(error);
|
|
@@ -6717,6 +6780,12 @@ var SimplifiedVGFRecognitionClient = class {
|
|
|
6717
6780
|
this.markRecordingStarted();
|
|
6718
6781
|
this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
|
|
6719
6782
|
}
|
|
6783
|
+
sendPrefixAudio(audioData) {
|
|
6784
|
+
this.client.sendPrefixAudio(audioData);
|
|
6785
|
+
}
|
|
6786
|
+
getStats() {
|
|
6787
|
+
return this.client.getStats();
|
|
6788
|
+
}
|
|
6720
6789
|
/**
|
|
6721
6790
|
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
6722
6791
|
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|