@volley/recognition-client-sdk 0.1.799 → 0.1.803
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser.bundled.d.ts +15 -0
- package/dist/index.bundled.d.ts +139 -93
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +96 -11
- package/dist/index.js.map +2 -2
- package/dist/recognition-client.types.d.ts +15 -0
- package/dist/recognition-client.types.d.ts.map +1 -1
- package/dist/simplified-vgf-recognition-client.d.ts +15 -94
- package/dist/simplified-vgf-recognition-client.d.ts.map +1 -1
- package/dist/vgf-recognition-mapper.d.ts +9 -17
- package/dist/vgf-recognition-mapper.d.ts.map +1 -1
- package/dist/vgf-recognition-state.d.ts +110 -0
- package/dist/vgf-recognition-state.d.ts.map +1 -1
- package/package.json +4 -4
- package/src/index.ts +1 -0
- package/src/recognition-client.types.ts +16 -0
- package/src/simplified-vgf-recognition-client.spec.ts +0 -27
- package/src/simplified-vgf-recognition-client.ts +84 -133
- package/src/vgf-recognition-mapper.spec.ts +143 -0
- package/src/vgf-recognition-mapper.ts +35 -45
- package/src/vgf-recognition-state.ts +44 -1
|
@@ -12,22 +12,25 @@ import {
|
|
|
12
12
|
RecognitionState,
|
|
13
13
|
TranscriptionStatus,
|
|
14
14
|
RecordingStatus,
|
|
15
|
-
RecognitionActionProcessingState
|
|
15
|
+
RecognitionActionProcessingState,
|
|
16
|
+
isTerminal
|
|
16
17
|
} from './vgf-recognition-state.js';
|
|
17
18
|
import {
|
|
18
19
|
IRecognitionClient,
|
|
19
20
|
IRecognitionClientConfig,
|
|
21
|
+
IRecognitionClientStats,
|
|
20
22
|
ClientState
|
|
21
23
|
} from './recognition-client.types.js';
|
|
22
24
|
import { RealTimeTwoWayWebSocketRecognitionClient } from './recognition-client.js';
|
|
23
25
|
import {
|
|
24
26
|
createVGFStateFromConfig,
|
|
25
27
|
mapTranscriptionResultToState,
|
|
28
|
+
mapSessionConfiguredToState,
|
|
26
29
|
mapErrorToState,
|
|
27
30
|
updateStateOnStop,
|
|
28
31
|
resetRecognitionVGFState
|
|
29
32
|
} from './vgf-recognition-mapper.js';
|
|
30
|
-
import type
|
|
33
|
+
import { RecognitionContextTypeV1, type GameContextV1 } from '@recog/shared-types';
|
|
31
34
|
|
|
32
35
|
/**
|
|
33
36
|
* Configuration for SimplifiedVGFRecognitionClient
|
|
@@ -49,126 +52,22 @@ export interface SimplifiedVGFClientConfig extends IRecognitionClientConfig {
|
|
|
49
52
|
/**
|
|
50
53
|
* Interface for SimplifiedVGFRecognitionClient
|
|
51
54
|
*
|
|
52
|
-
*
|
|
53
|
-
*
|
|
55
|
+
* Inherits the full IRecognitionClient surface (connect, sendAudio,
|
|
56
|
+
* sendAudioWithSampleRate, sendPrefixAudio, stopRecording, stopAbnormally,
|
|
57
|
+
* status checks, sendGameContext, getStats, getUrl, getState, getAudioUtteranceId)
|
|
58
|
+
* — see recognition-client.types.ts for those. Adds VGF-specific state access.
|
|
59
|
+
*
|
|
60
|
+
* Extending IRecognitionClient (rather than redeclaring methods) means
|
|
61
|
+
* TypeScript catches any base-client method that's not delegated by the
|
|
62
|
+
* VGF wrapper at compile time — keeps the two surfaces in sync.
|
|
54
63
|
*/
|
|
55
|
-
export interface ISimplifiedVGFRecognitionClient {
|
|
56
|
-
// ============= Core Connection Methods =============
|
|
57
|
-
/**
|
|
58
|
-
* Connect to the recognition service WebSocket
|
|
59
|
-
* @returns Promise that resolves when connected and ready
|
|
60
|
-
*/
|
|
61
|
-
connect(): Promise<void>;
|
|
62
|
-
|
|
63
|
-
/**
|
|
64
|
-
* Send audio data for transcription
|
|
65
|
-
* @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
|
|
66
|
-
*/
|
|
67
|
-
sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
|
|
68
|
-
|
|
69
|
-
/**
|
|
70
|
-
* Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
|
|
71
|
-
* downsamples to the session's target rate before transmitting. Use
|
|
72
|
-
* when capture is at the system's native rate (browser AudioContext is
|
|
73
|
-
* typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
|
|
74
|
-
* little-endian PCM, mono.
|
|
75
|
-
*/
|
|
76
|
-
sendAudioWithSampleRate(
|
|
77
|
-
audioData: ArrayBuffer | ArrayBufferView | Blob,
|
|
78
|
-
sourceSampleRate: number
|
|
79
|
-
): void;
|
|
80
|
-
|
|
81
|
-
/**
|
|
82
|
-
* Stop recording and wait for final transcription
|
|
83
|
-
* @returns Promise that resolves when transcription is complete
|
|
84
|
-
*/
|
|
85
|
-
stopRecording(): Promise<void>;
|
|
86
|
-
|
|
64
|
+
export interface ISimplifiedVGFRecognitionClient extends IRecognitionClient {
|
|
87
65
|
/**
|
|
88
|
-
*
|
|
89
|
-
*
|
|
90
|
-
* WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
|
|
91
|
-
* - Does NOT wait for server to process remaining audio
|
|
92
|
-
* - Does NOT receive final transcript from server (VGF state set to empty)
|
|
93
|
-
* - Immediately closes WebSocket connection
|
|
94
|
-
* - Cleans up resources (buffers, listeners)
|
|
95
|
-
*
|
|
96
|
-
* Use Cases:
|
|
97
|
-
* - User explicitly cancels/abandons the session
|
|
98
|
-
* - Timeout scenarios where waiting is not acceptable
|
|
99
|
-
* - Need immediate cleanup and can't wait for server
|
|
100
|
-
*
|
|
101
|
-
* RECOMMENDED: Use stopRecording() for normal shutdown.
|
|
102
|
-
* Only use this when immediate disconnection is required.
|
|
103
|
-
*/
|
|
104
|
-
stopAbnormally(): void;
|
|
105
|
-
|
|
106
|
-
// ============= VGF State Methods =============
|
|
107
|
-
/**
|
|
108
|
-
* Get the current VGF recognition state
|
|
66
|
+
* Get the current VGF recognition state — the single shared store
|
|
67
|
+
* of inputs and outputs for this utterance.
|
|
109
68
|
* @returns Current RecognitionState with all transcription data
|
|
110
69
|
*/
|
|
111
70
|
getVGFState(): RecognitionState;
|
|
112
|
-
|
|
113
|
-
// ============= Status Check Methods =============
|
|
114
|
-
/**
|
|
115
|
-
* Check if connected to the WebSocket
|
|
116
|
-
*/
|
|
117
|
-
isConnected(): boolean;
|
|
118
|
-
|
|
119
|
-
/**
|
|
120
|
-
* Check if currently connecting
|
|
121
|
-
*/
|
|
122
|
-
isConnecting(): boolean;
|
|
123
|
-
|
|
124
|
-
/**
|
|
125
|
-
* Check if currently stopping
|
|
126
|
-
*/
|
|
127
|
-
isStopping(): boolean;
|
|
128
|
-
|
|
129
|
-
/**
|
|
130
|
-
* Check if transcription has finished
|
|
131
|
-
*/
|
|
132
|
-
isTranscriptionFinished(): boolean;
|
|
133
|
-
|
|
134
|
-
/**
|
|
135
|
-
* Check if the audio buffer has overflowed
|
|
136
|
-
*/
|
|
137
|
-
isBufferOverflowing(): boolean;
|
|
138
|
-
|
|
139
|
-
// ============= Preconnect Methods =============
|
|
140
|
-
/**
|
|
141
|
-
* Send game context after connection is established (for preconnect flow).
|
|
142
|
-
*
|
|
143
|
-
* Preconnect flow: Create client with asrRequestConfig (useContext: true) but
|
|
144
|
-
* WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
|
|
145
|
-
*
|
|
146
|
-
* @param context - Game context including slotMap for keyword boosting
|
|
147
|
-
*/
|
|
148
|
-
sendGameContext(context: GameContextV1): void;
|
|
149
|
-
|
|
150
|
-
/**
|
|
151
|
-
* Check if server has sent READY signal (provider connected, ready for audio).
|
|
152
|
-
* In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
|
|
153
|
-
*/
|
|
154
|
-
isServerReady(): boolean;
|
|
155
|
-
|
|
156
|
-
// ============= Utility Methods =============
|
|
157
|
-
/**
|
|
158
|
-
* Get the audio utterance ID for this session
|
|
159
|
-
*/
|
|
160
|
-
getAudioUtteranceId(): string;
|
|
161
|
-
|
|
162
|
-
/**
|
|
163
|
-
* Get the WebSocket URL being used
|
|
164
|
-
*/
|
|
165
|
-
getUrl(): string;
|
|
166
|
-
|
|
167
|
-
/**
|
|
168
|
-
* Get the underlying client state (for advanced usage)
|
|
169
|
-
*/
|
|
170
|
-
getState(): ClientState;
|
|
171
|
-
|
|
172
71
|
}
|
|
173
72
|
|
|
174
73
|
/**
|
|
@@ -241,23 +140,51 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
241
140
|
// Track the expected UUID for this session
|
|
242
141
|
this.expectedUuid = this.state.audioUtteranceId;
|
|
243
142
|
|
|
244
|
-
//
|
|
245
|
-
|
|
143
|
+
// Backfill gameContext from state if the caller didn't pass one in config.
|
|
144
|
+
// Lets RecognitionState carry session identity (gameId + gamePhase) alongside
|
|
145
|
+
// runtime prompt inputs (promptSlotMap, promptSTT/STF/TTF), so a single
|
|
146
|
+
// initialState object is sufficient — no separate gameContext required.
|
|
147
|
+
// Backward-compatible: if both are provided, clientConfig.gameContext wins.
|
|
148
|
+
if (!clientConfig.gameContext && this.state.gameId && this.state.gamePhase) {
|
|
149
|
+
clientConfig.gameContext = {
|
|
150
|
+
type: RecognitionContextTypeV1.GAME_CONTEXT,
|
|
151
|
+
gameId: this.state.gameId,
|
|
152
|
+
gamePhase: this.state.gamePhase,
|
|
153
|
+
};
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Forward optional VGF inputs (promptSlotMap, promptSTT/STF/TTF) into the GameContext.
|
|
157
|
+
const hasPromptInputs =
|
|
158
|
+
this.state.promptSlotMap !== undefined ||
|
|
159
|
+
this.state.promptSTT !== undefined ||
|
|
160
|
+
this.state.promptSTF !== undefined ||
|
|
161
|
+
this.state.promptTTF !== undefined;
|
|
162
|
+
|
|
163
|
+
if (hasPromptInputs) {
|
|
246
164
|
// Set useContext=true in ASR config to enable context processing
|
|
247
165
|
if (clientConfig.asrRequestConfig) {
|
|
248
166
|
clientConfig.asrRequestConfig.useContext = true;
|
|
249
167
|
}
|
|
250
168
|
|
|
251
|
-
// Add promptSlotMap to gameContext
|
|
252
169
|
if (!clientConfig.gameContext) {
|
|
253
|
-
//
|
|
254
|
-
//
|
|
170
|
+
// No gameContext from config, and state didn't carry gameId+gamePhase
|
|
171
|
+
// either — prompt inputs have nowhere to ride. Warn and drop.
|
|
255
172
|
if (clientConfig.logger) {
|
|
256
|
-
clientConfig.logger('warn', '[VGF]
|
|
173
|
+
clientConfig.logger('warn', '[VGF] prompt inputs found but no gameContext provided and state has no gameId/gamePhase. They will not be sent.');
|
|
257
174
|
}
|
|
258
175
|
} else {
|
|
259
|
-
|
|
260
|
-
|
|
176
|
+
if (this.state.promptSlotMap !== undefined) {
|
|
177
|
+
clientConfig.gameContext.slotMap = this.state.promptSlotMap;
|
|
178
|
+
}
|
|
179
|
+
if (this.state.promptSTT !== undefined) {
|
|
180
|
+
clientConfig.gameContext.promptSTT = this.state.promptSTT;
|
|
181
|
+
}
|
|
182
|
+
if (this.state.promptSTF !== undefined) {
|
|
183
|
+
clientConfig.gameContext.promptSTF = this.state.promptSTF;
|
|
184
|
+
}
|
|
185
|
+
if (this.state.promptTTF !== undefined) {
|
|
186
|
+
clientConfig.gameContext.promptTTF = this.state.promptTTF;
|
|
187
|
+
}
|
|
261
188
|
}
|
|
262
189
|
}
|
|
263
190
|
|
|
@@ -303,6 +230,25 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
303
230
|
}
|
|
304
231
|
},
|
|
305
232
|
|
|
233
|
+
onSessionConfigured: (sessionConfigured): void => {
|
|
234
|
+
// Skip update if UUID doesn't match (stale callback from previous session)
|
|
235
|
+
if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
|
|
236
|
+
if (this.logger) {
|
|
237
|
+
this.logger('warn',
|
|
238
|
+
`[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
|
|
239
|
+
);
|
|
240
|
+
}
|
|
241
|
+
return;
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
|
|
245
|
+
this.notifyStateChange();
|
|
246
|
+
|
|
247
|
+
if (clientConfig.onSessionConfigured) {
|
|
248
|
+
clientConfig.onSessionConfigured(sessionConfigured);
|
|
249
|
+
}
|
|
250
|
+
},
|
|
251
|
+
|
|
306
252
|
onFunctionCall: (result): void => {
|
|
307
253
|
// Pass through function call - no VGF state changes needed for P2 feature
|
|
308
254
|
if (clientConfig.onFunctionCall) {
|
|
@@ -322,7 +268,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
322
268
|
}
|
|
323
269
|
|
|
324
270
|
this.isRecordingAudio = false; // Reset on error
|
|
325
|
-
this.state = mapErrorToState(this.state
|
|
271
|
+
this.state = mapErrorToState(this.state);
|
|
326
272
|
this.notifyStateChange();
|
|
327
273
|
|
|
328
274
|
if (clientConfig.onError) {
|
|
@@ -367,6 +313,17 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
367
313
|
this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
|
|
368
314
|
}
|
|
369
315
|
|
|
316
|
+
sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void {
|
|
317
|
+
// Pure pass-through. Prefix audio is NOT user audio, so do not flip
|
|
318
|
+
// startRecordingStatus to RECORDING — that transition belongs to the
|
|
319
|
+
// first sendAudio() call.
|
|
320
|
+
this.client.sendPrefixAudio(audioData);
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
getStats(): IRecognitionClientStats {
|
|
324
|
+
return this.client.getStats();
|
|
325
|
+
}
|
|
326
|
+
|
|
370
327
|
/**
|
|
371
328
|
* Set VGF recording status to RECORDING on the first audio chunk.
|
|
372
329
|
* Idempotent — subsequent calls are no-ops until disconnect/stop resets
|
|
@@ -490,16 +447,10 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
|
|
|
490
447
|
return { ...this.state };
|
|
491
448
|
}
|
|
492
449
|
|
|
493
|
-
private isTerminalStatus(status: string | undefined): boolean {
|
|
494
|
-
return status === TranscriptionStatus.FINALIZED ||
|
|
495
|
-
status === TranscriptionStatus.ABORTED ||
|
|
496
|
-
status === TranscriptionStatus.ERROR;
|
|
497
|
-
}
|
|
498
|
-
|
|
499
450
|
private notifyStateChange(): void {
|
|
500
451
|
|
|
501
452
|
// Block duplicate terminal status emissions for THIS session
|
|
502
|
-
if (
|
|
453
|
+
if (isTerminal(this.state)) {
|
|
503
454
|
if (this.lastSentTerminalUuid === this.expectedUuid) {
|
|
504
455
|
// Already sent a terminal status for this session - suppress duplicate
|
|
505
456
|
if (this.logger) {
|
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
*/
|
|
4
4
|
|
|
5
5
|
import {
|
|
6
|
+
mapTranscriptionResultToState,
|
|
7
|
+
mapSessionConfiguredToState,
|
|
6
8
|
resetRecognitionVGFState
|
|
7
9
|
} from './vgf-recognition-mapper.js';
|
|
8
10
|
import {
|
|
@@ -11,6 +13,13 @@ import {
|
|
|
11
13
|
TranscriptionStatus,
|
|
12
14
|
RecognitionActionProcessingState
|
|
13
15
|
} from './vgf-recognition-state.js';
|
|
16
|
+
import {
|
|
17
|
+
ASRApiType,
|
|
18
|
+
DetectionTypeV1,
|
|
19
|
+
RecognitionResultTypeV1,
|
|
20
|
+
SessionConfiguredV1,
|
|
21
|
+
TranscriptionResultV1
|
|
22
|
+
} from '@recog/shared-types';
|
|
14
23
|
|
|
15
24
|
describe('resetRecognitionVGFState', () => {
|
|
16
25
|
it('should generate a new UUID', () => {
|
|
@@ -75,4 +84,138 @@ describe('resetRecognitionVGFState', () => {
|
|
|
75
84
|
expect(newState.startRecordingStatus).toBe(RecordingStatus.READY);
|
|
76
85
|
expect(newState.recognitionActionProcessingState).toBe(RecognitionActionProcessingState.NOT_STARTED);
|
|
77
86
|
});
|
|
87
|
+
|
|
88
|
+
it('should preserve prompt input fields and clear detections', () => {
|
|
89
|
+
const originalState: RecognitionState = {
|
|
90
|
+
audioUtteranceId: 'old-uuid-123',
|
|
91
|
+
pendingTranscript: '',
|
|
92
|
+
promptSTT: 'hello,world',
|
|
93
|
+
promptSTF: 'map to play()',
|
|
94
|
+
promptTTF: 'extract title,artist',
|
|
95
|
+
detections: [
|
|
96
|
+
{ type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.92 }
|
|
97
|
+
]
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
const newState = resetRecognitionVGFState(originalState);
|
|
101
|
+
|
|
102
|
+
expect(newState.promptSTT).toBe('hello,world');
|
|
103
|
+
expect(newState.promptSTF).toBe('map to play()');
|
|
104
|
+
expect(newState.promptTTF).toBe('extract title,artist');
|
|
105
|
+
expect(newState.detections).toBeUndefined();
|
|
106
|
+
});
|
|
107
|
+
});
|
|
108
|
+
|
|
109
|
+
describe('mapTranscriptionResultToState detections', () => {
|
|
110
|
+
const baseState: RecognitionState = {
|
|
111
|
+
audioUtteranceId: 'utt-1',
|
|
112
|
+
pendingTranscript: ''
|
|
113
|
+
};
|
|
114
|
+
|
|
115
|
+
it('should copy detections from a pending transcript result', () => {
|
|
116
|
+
const result: TranscriptionResultV1 = {
|
|
117
|
+
type: RecognitionResultTypeV1.TRANSCRIPTION,
|
|
118
|
+
audioUtteranceId: 'utt-1',
|
|
119
|
+
finalTranscript: '',
|
|
120
|
+
finalTranscriptRaw: '',
|
|
121
|
+
pendingTranscript: 'one',
|
|
122
|
+
is_finished: false,
|
|
123
|
+
detections: [
|
|
124
|
+
{ type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
|
|
125
|
+
]
|
|
126
|
+
};
|
|
127
|
+
|
|
128
|
+
const newState = mapTranscriptionResultToState(baseState, result, true);
|
|
129
|
+
|
|
130
|
+
expect(newState.detections).toEqual([
|
|
131
|
+
{ type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
|
|
132
|
+
]);
|
|
133
|
+
});
|
|
134
|
+
|
|
135
|
+
it('should copy detections from a final transcript result', () => {
|
|
136
|
+
const result: TranscriptionResultV1 = {
|
|
137
|
+
type: RecognitionResultTypeV1.TRANSCRIPTION,
|
|
138
|
+
audioUtteranceId: 'utt-1',
|
|
139
|
+
finalTranscript: 'one time',
|
|
140
|
+
finalTranscriptRaw: 'one time',
|
|
141
|
+
is_finished: true,
|
|
142
|
+
detections: [
|
|
143
|
+
{ type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.93 }
|
|
144
|
+
]
|
|
145
|
+
};
|
|
146
|
+
|
|
147
|
+
const newState = mapTranscriptionResultToState(baseState, result, false);
|
|
148
|
+
|
|
149
|
+
expect(newState.detections).toHaveLength(1);
|
|
150
|
+
expect(newState.detections?.[0]?.query).toBe('one time');
|
|
151
|
+
});
|
|
152
|
+
|
|
153
|
+
it('should mirror SessionConfiguredV1 onto state and clear on reset', () => {
|
|
154
|
+
const sessionConfigured: SessionConfiguredV1 = {
|
|
155
|
+
type: RecognitionResultTypeV1.SESSION_CONFIGURED,
|
|
156
|
+
audioUtteranceId: 'utt-1',
|
|
157
|
+
provider: 'deepgram',
|
|
158
|
+
model: 'nova-3',
|
|
159
|
+
sampleRate: 16000,
|
|
160
|
+
encoding: 'linear16',
|
|
161
|
+
apiType: ASRApiType.STREAMING,
|
|
162
|
+
isFallback: false
|
|
163
|
+
};
|
|
164
|
+
const afterConfigured = mapSessionConfiguredToState(baseState, sessionConfigured);
|
|
165
|
+
expect(afterConfigured.sessionConfigured?.provider).toBe('deepgram');
|
|
166
|
+
expect(afterConfigured.sessionConfigured?.model).toBe('nova-3');
|
|
167
|
+
expect(afterConfigured.sessionConfigured?.isFallback).toBe(false);
|
|
168
|
+
|
|
169
|
+
const afterReset = resetRecognitionVGFState(afterConfigured);
|
|
170
|
+
expect(afterReset.sessionConfigured).toBeUndefined();
|
|
171
|
+
});
|
|
172
|
+
|
|
173
|
+
it('should copy accumulatedAudioTimeMs on pending and final transcripts and clear on reset', () => {
|
|
174
|
+
const pending: TranscriptionResultV1 = {
|
|
175
|
+
type: RecognitionResultTypeV1.TRANSCRIPTION,
|
|
176
|
+
audioUtteranceId: 'utt-1',
|
|
177
|
+
finalTranscript: '',
|
|
178
|
+
finalTranscriptRaw: '',
|
|
179
|
+
pendingTranscript: 'hi',
|
|
180
|
+
is_finished: false,
|
|
181
|
+
accumulatedAudioTimeMs: 1234
|
|
182
|
+
};
|
|
183
|
+
const afterPending = mapTranscriptionResultToState(baseState, pending, true);
|
|
184
|
+
expect(afterPending.accumulatedAudioTimeMs).toBe(1234);
|
|
185
|
+
|
|
186
|
+
const final: TranscriptionResultV1 = {
|
|
187
|
+
type: RecognitionResultTypeV1.TRANSCRIPTION,
|
|
188
|
+
audioUtteranceId: 'utt-1',
|
|
189
|
+
finalTranscript: 'hi',
|
|
190
|
+
finalTranscriptRaw: 'hi',
|
|
191
|
+
is_finished: true,
|
|
192
|
+
accumulatedAudioTimeMs: 5678
|
|
193
|
+
};
|
|
194
|
+
const afterFinal = mapTranscriptionResultToState(afterPending, final, false);
|
|
195
|
+
expect(afterFinal.accumulatedAudioTimeMs).toBe(5678);
|
|
196
|
+
|
|
197
|
+
const afterReset = resetRecognitionVGFState(afterFinal);
|
|
198
|
+
expect(afterReset.accumulatedAudioTimeMs).toBeUndefined();
|
|
199
|
+
});
|
|
200
|
+
|
|
201
|
+
it('should leave existing detections untouched when the result omits them', () => {
|
|
202
|
+
const stateWithDetections: RecognitionState = {
|
|
203
|
+
...baseState,
|
|
204
|
+
detections: [{ type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }]
|
|
205
|
+
};
|
|
206
|
+
const result: TranscriptionResultV1 = {
|
|
207
|
+
type: RecognitionResultTypeV1.TRANSCRIPTION,
|
|
208
|
+
audioUtteranceId: 'utt-1',
|
|
209
|
+
finalTranscript: '',
|
|
210
|
+
finalTranscriptRaw: '',
|
|
211
|
+
pendingTranscript: 'hello',
|
|
212
|
+
is_finished: false
|
|
213
|
+
};
|
|
214
|
+
|
|
215
|
+
const newState = mapTranscriptionResultToState(stateWithDetections, result, true);
|
|
216
|
+
|
|
217
|
+
expect(newState.detections).toEqual([
|
|
218
|
+
{ type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }
|
|
219
|
+
]);
|
|
220
|
+
});
|
|
78
221
|
});
|
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
} from './recognition-client.types.js';
|
|
20
20
|
import {
|
|
21
21
|
TranscriptionResultV1,
|
|
22
|
-
|
|
22
|
+
SessionConfiguredV1
|
|
23
23
|
} from '@recog/shared-types';
|
|
24
24
|
|
|
25
25
|
/**
|
|
@@ -94,6 +94,9 @@ export function mapTranscriptionResultToState(
|
|
|
94
94
|
if (result.lastNonSilence !== undefined) {
|
|
95
95
|
newState.lastNonSilence = result.lastNonSilence;
|
|
96
96
|
}
|
|
97
|
+
if (result.accumulatedAudioTimeMs !== undefined) {
|
|
98
|
+
newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
|
|
99
|
+
}
|
|
97
100
|
} else {
|
|
98
101
|
// Transcription is finished
|
|
99
102
|
newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
|
|
@@ -110,21 +113,46 @@ export function mapTranscriptionResultToState(
|
|
|
110
113
|
if (result.lastNonSilence !== undefined) {
|
|
111
114
|
newState.lastNonSilence = result.lastNonSilence;
|
|
112
115
|
}
|
|
116
|
+
if (result.accumulatedAudioTimeMs !== undefined) {
|
|
117
|
+
newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
|
|
118
|
+
}
|
|
113
119
|
|
|
114
120
|
// Clear pending when we have final
|
|
115
121
|
newState.pendingTranscript = "";
|
|
116
122
|
newState.pendingConfidence = undefined;
|
|
117
123
|
}
|
|
118
124
|
|
|
125
|
+
// Mirror provider-reported detections (currently Deepgram `search`; future entries
|
|
126
|
+
// may include keywords/keyterms/speech_contexts from other providers).
|
|
127
|
+
// Server pre-sorts by score descending, so [0] is the top hit — passed through as-is.
|
|
128
|
+
if (result.detections !== undefined) {
|
|
129
|
+
newState.detections = result.detections;
|
|
130
|
+
}
|
|
131
|
+
|
|
119
132
|
return newState;
|
|
120
133
|
}
|
|
121
134
|
|
|
135
|
+
/**
|
|
136
|
+
* Mirrors the SessionConfiguredV1 message onto the VGF state.
|
|
137
|
+
* Carries the resolved provider/model/sampleRate/encoding/apiType/isFallback
|
|
138
|
+
* that the server actually chose (after circuit-breaker/fallback). Fires once
|
|
139
|
+
* per session, before audio streaming begins.
|
|
140
|
+
*/
|
|
141
|
+
export function mapSessionConfiguredToState(
|
|
142
|
+
currentState: RecognitionState,
|
|
143
|
+
sessionConfigured: SessionConfiguredV1
|
|
144
|
+
): RecognitionState {
|
|
145
|
+
return {
|
|
146
|
+
...currentState,
|
|
147
|
+
sessionConfigured
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
|
|
122
151
|
/**
|
|
123
152
|
* Maps error to state
|
|
124
153
|
*/
|
|
125
154
|
export function mapErrorToState(
|
|
126
|
-
currentState: RecognitionState
|
|
127
|
-
error: ErrorResultV1
|
|
155
|
+
currentState: RecognitionState
|
|
128
156
|
): RecognitionState {
|
|
129
157
|
return {
|
|
130
158
|
...currentState,
|
|
@@ -185,7 +213,10 @@ export function resetRecognitionVGFState(currentState: RecognitionState): Recogn
|
|
|
185
213
|
recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
|
|
186
214
|
finalTranscript: undefined,
|
|
187
215
|
voiceEnd: undefined,
|
|
188
|
-
lastNonSilence: undefined
|
|
216
|
+
lastNonSilence: undefined,
|
|
217
|
+
accumulatedAudioTimeMs: undefined,
|
|
218
|
+
detections: undefined,
|
|
219
|
+
sessionConfigured: undefined
|
|
189
220
|
};
|
|
190
221
|
}
|
|
191
222
|
|
|
@@ -199,47 +230,6 @@ export function updateStateOnReady(currentState: RecognitionState): RecognitionS
|
|
|
199
230
|
};
|
|
200
231
|
}
|
|
201
232
|
|
|
202
|
-
/**
|
|
203
|
-
* Parses function call from transcript (STEP 3 support)
|
|
204
|
-
* This is a placeholder - actual implementation would use NLP/LLM
|
|
205
|
-
*/
|
|
206
|
-
export function extractFunctionCallFromTranscript(
|
|
207
|
-
transcript: string,
|
|
208
|
-
gameContext?: any
|
|
209
|
-
): { metadata?: string; confidence?: number } | null {
|
|
210
|
-
// This would be replaced with actual function call extraction logic
|
|
211
|
-
// For example, using an LLM to parse intent from the transcript
|
|
212
|
-
// and map it to game actions
|
|
213
|
-
|
|
214
|
-
// Example stub implementation:
|
|
215
|
-
const lowerTranscript = transcript.toLowerCase();
|
|
216
|
-
|
|
217
|
-
// Simple pattern matching for demo
|
|
218
|
-
if (lowerTranscript.includes("play") && lowerTranscript.includes("artist")) {
|
|
219
|
-
return {
|
|
220
|
-
metadata: JSON.stringify({ action: "play", target: "artist" }),
|
|
221
|
-
confidence: 0.8
|
|
222
|
-
};
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
return null;
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
/**
|
|
229
|
-
* Updates state with function call results (STEP 3)
|
|
230
|
-
*/
|
|
231
|
-
export function updateStateWithFunctionCall(
|
|
232
|
-
currentState: RecognitionState,
|
|
233
|
-
functionCall: { metadata?: string; confidence?: number }
|
|
234
|
-
): RecognitionState {
|
|
235
|
-
return {
|
|
236
|
-
...currentState,
|
|
237
|
-
functionCallMetadata: functionCall.metadata,
|
|
238
|
-
functionCallConfidence: functionCall.confidence,
|
|
239
|
-
finalFunctionCallTimestamp: new Date().toISOString()
|
|
240
|
-
};
|
|
241
|
-
}
|
|
242
|
-
|
|
243
233
|
// Helper function to generate UUID (simplified version)
|
|
244
234
|
function generateUUID(): string {
|
|
245
235
|
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { z } from "zod"
|
|
2
|
+
import { DetectionV1Schema, SessionConfiguredSchemaV1 } from "@recog/shared-types"
|
|
2
3
|
|
|
3
4
|
/**
|
|
4
5
|
* VGF-style state schema for game-side recognition state/results management.
|
|
@@ -25,9 +26,13 @@ export const RecognitionVGFStateSchema = z.object({
|
|
|
25
26
|
// Voice timing (ms from stream start, prefix-adjusted)
|
|
26
27
|
voiceEnd: z.number().optional(), // voice end time identified by ASR
|
|
27
28
|
lastNonSilence: z.number().optional(), // last non-silence sample time from PCM analysis
|
|
29
|
+
accumulatedAudioTimeMs: z.number().optional(), // total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
|
|
28
30
|
|
|
29
31
|
// Tracking-only metadata
|
|
30
|
-
asrConfig: z.string().optional(), // Json format of the ASR config
|
|
32
|
+
asrConfig: z.string().optional(), // Json format of the *requested* ASR config (set once at construction).
|
|
33
|
+
// For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
|
|
34
|
+
// server after circuit-breaker/fallback — see `sessionConfigured` below.
|
|
35
|
+
sessionConfigured: SessionConfiguredSchemaV1.optional(), // Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
|
|
31
36
|
startRecordingTimestamp: z.string().optional(), // Start of recording. Immutable after set.
|
|
32
37
|
finalRecordingTimestamp: z.string().optional(), // End of recording. Immutable after set. Transcription may still be in progress.
|
|
33
38
|
finalTranscriptionTimestamp: z.string().optional(), // When the final transcript was produced. Immutable after set.
|
|
@@ -41,9 +46,31 @@ export const RecognitionVGFStateSchema = z.object({
|
|
|
41
46
|
functionCallConfidence: z.number().optional(), // Confidence score for the function call.
|
|
42
47
|
finalFunctionCallTimestamp: z.string().optional(), // When the final action after interpreting the transcript was taken. Immutable.
|
|
43
48
|
|
|
49
|
+
// Session identity — when set, the VGF client backfills these into
|
|
50
|
+
// GameContextV1 if the caller didn't pass a `gameContext` in config.
|
|
51
|
+
// Lets RecognitionState be the single source of truth: server seeds
|
|
52
|
+
// `gameId` + `gamePhase` + `promptSlotMap` per player, controller passes
|
|
53
|
+
// the whole state as `initialState`, no separate `gameContext` needed.
|
|
54
|
+
// Backward-compatible: if `gameContext` is also passed in config, it wins.
|
|
55
|
+
gameId: z.string().optional(),
|
|
56
|
+
gamePhase: z.string().optional(),
|
|
57
|
+
|
|
44
58
|
// Support for prompt slot mapping - passed to recognition context when present
|
|
45
59
|
promptSlotMap: z.record(z.string(), z.array(z.string())).optional(), // Optional map of slot names to prompt values for recognition context
|
|
46
60
|
|
|
61
|
+
// Optional prompt inputs - when set, forwarded into GameContext at client creation.
|
|
62
|
+
// Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
|
|
63
|
+
promptSTT: z.string().optional(),
|
|
64
|
+
promptSTF: z.string().optional(),
|
|
65
|
+
promptTTF: z.string().optional(),
|
|
66
|
+
|
|
67
|
+
// Provider-reported phrase detections from the last transcript message.
|
|
68
|
+
// Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
|
|
69
|
+
// (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
|
|
70
|
+
// Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
|
|
71
|
+
// and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
|
|
72
|
+
detections: z.array(DetectionV1Schema).optional(),
|
|
73
|
+
|
|
47
74
|
// Recognition action processing state - managed externally, SDK preserves but never modifies
|
|
48
75
|
recognitionActionProcessingState: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
|
|
49
76
|
})
|
|
@@ -89,6 +116,22 @@ export function createInitialRecognitionState(audioUtteranceId: string): Recogni
|
|
|
89
116
|
}
|
|
90
117
|
}
|
|
91
118
|
|
|
119
|
+
// Helper for "session ended — no more state updates coming".
|
|
120
|
+
// Terminal states are FINALIZED (clean end with transcript), ABORTED
|
|
121
|
+
// (user cancelled), and ERROR (something failed). Use this anywhere
|
|
122
|
+
// you forward RecognitionState updates to a server thunk: persist
|
|
123
|
+
// every update, but only do cleanup / observability / scoring work
|
|
124
|
+
// after isTerminal(state) is true. Branch on `transcriptionStatus`
|
|
125
|
+
// when the action depends on *which* terminal state (e.g. only score
|
|
126
|
+
// on FINALIZED, only emit "cancelled" telemetry on ABORTED).
|
|
127
|
+
export function isTerminal(state: Pick<RecognitionState, "transcriptionStatus">): boolean {
|
|
128
|
+
return (
|
|
129
|
+
state.transcriptionStatus === TranscriptionStatus.FINALIZED ||
|
|
130
|
+
state.transcriptionStatus === TranscriptionStatus.ABORTED ||
|
|
131
|
+
state.transcriptionStatus === TranscriptionStatus.ERROR
|
|
132
|
+
)
|
|
133
|
+
}
|
|
134
|
+
|
|
92
135
|
// Helper function to validate state transitions
|
|
93
136
|
export function isValidRecordingStatusTransition(from: string | undefined, to: string): boolean {
|
|
94
137
|
const statusOrder = [
|