@volley/recognition-client-sdk 0.1.799 → 0.1.803

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,22 +12,25 @@ import {
12
12
  RecognitionState,
13
13
  TranscriptionStatus,
14
14
  RecordingStatus,
15
- RecognitionActionProcessingState
15
+ RecognitionActionProcessingState,
16
+ isTerminal
16
17
  } from './vgf-recognition-state.js';
17
18
  import {
18
19
  IRecognitionClient,
19
20
  IRecognitionClientConfig,
21
+ IRecognitionClientStats,
20
22
  ClientState
21
23
  } from './recognition-client.types.js';
22
24
  import { RealTimeTwoWayWebSocketRecognitionClient } from './recognition-client.js';
23
25
  import {
24
26
  createVGFStateFromConfig,
25
27
  mapTranscriptionResultToState,
28
+ mapSessionConfiguredToState,
26
29
  mapErrorToState,
27
30
  updateStateOnStop,
28
31
  resetRecognitionVGFState
29
32
  } from './vgf-recognition-mapper.js';
30
- import type { GameContextV1 } from '@recog/shared-types';
33
+ import { RecognitionContextTypeV1, type GameContextV1 } from '@recog/shared-types';
31
34
 
32
35
  /**
33
36
  * Configuration for SimplifiedVGFRecognitionClient
@@ -49,126 +52,22 @@ export interface SimplifiedVGFClientConfig extends IRecognitionClientConfig {
49
52
  /**
50
53
  * Interface for SimplifiedVGFRecognitionClient
51
54
  *
52
- * A simplified client that maintains VGF state for game developers.
53
- * All methods from the underlying client are available, plus VGF state management.
55
+ * Inherits the full IRecognitionClient surface (connect, sendAudio,
56
+ * sendAudioWithSampleRate, sendPrefixAudio, stopRecording, stopAbnormally,
57
+ * status checks, sendGameContext, getStats, getUrl, getState, getAudioUtteranceId)
58
+ * — see recognition-client.types.ts for those. Adds VGF-specific state access.
59
+ *
60
+ * Extending IRecognitionClient (rather than redeclaring methods) means
61
+ * TypeScript catches any base-client method that's not delegated by the
62
+ * VGF wrapper at compile time — keeps the two surfaces in sync.
54
63
  */
55
- export interface ISimplifiedVGFRecognitionClient {
56
- // ============= Core Connection Methods =============
57
- /**
58
- * Connect to the recognition service WebSocket
59
- * @returns Promise that resolves when connected and ready
60
- */
61
- connect(): Promise<void>;
62
-
63
- /**
64
- * Send audio data for transcription
65
- * @param audioData - PCM audio data as ArrayBuffer, typed array, or Blob
66
- */
67
- sendAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void;
68
-
69
- /**
70
- * Send PCM16 mono audio captured at `sourceSampleRate`; the SDK
71
- * downsamples to the session's target rate before transmitting. Use
72
- * when capture is at the system's native rate (browser AudioContext is
73
- * typically 44.1 kHz or 48 kHz). Audio must be signed 16-bit
74
- * little-endian PCM, mono.
75
- */
76
- sendAudioWithSampleRate(
77
- audioData: ArrayBuffer | ArrayBufferView | Blob,
78
- sourceSampleRate: number
79
- ): void;
80
-
81
- /**
82
- * Stop recording and wait for final transcription
83
- * @returns Promise that resolves when transcription is complete
84
- */
85
- stopRecording(): Promise<void>;
86
-
64
+ export interface ISimplifiedVGFRecognitionClient extends IRecognitionClient {
87
65
  /**
88
- * Force stop and immediately close connection without waiting for server
89
- *
90
- * WARNING: This is an abnormal shutdown that bypasses the graceful stop flow:
91
- * - Does NOT wait for server to process remaining audio
92
- * - Does NOT receive final transcript from server (VGF state set to empty)
93
- * - Immediately closes WebSocket connection
94
- * - Cleans up resources (buffers, listeners)
95
- *
96
- * Use Cases:
97
- * - User explicitly cancels/abandons the session
98
- * - Timeout scenarios where waiting is not acceptable
99
- * - Need immediate cleanup and can't wait for server
100
- *
101
- * RECOMMENDED: Use stopRecording() for normal shutdown.
102
- * Only use this when immediate disconnection is required.
103
- */
104
- stopAbnormally(): void;
105
-
106
- // ============= VGF State Methods =============
107
- /**
108
- * Get the current VGF recognition state
66
+ * Get the current VGF recognition state — the single shared store
67
+ * of inputs and outputs for this utterance.
109
68
  * @returns Current RecognitionState with all transcription data
110
69
  */
111
70
  getVGFState(): RecognitionState;
112
-
113
- // ============= Status Check Methods =============
114
- /**
115
- * Check if connected to the WebSocket
116
- */
117
- isConnected(): boolean;
118
-
119
- /**
120
- * Check if currently connecting
121
- */
122
- isConnecting(): boolean;
123
-
124
- /**
125
- * Check if currently stopping
126
- */
127
- isStopping(): boolean;
128
-
129
- /**
130
- * Check if transcription has finished
131
- */
132
- isTranscriptionFinished(): boolean;
133
-
134
- /**
135
- * Check if the audio buffer has overflowed
136
- */
137
- isBufferOverflowing(): boolean;
138
-
139
- // ============= Preconnect Methods =============
140
- /**
141
- * Send game context after connection is established (for preconnect flow).
142
- *
143
- * Preconnect flow: Create client with asrRequestConfig (useContext: true) but
144
- * WITHOUT gameContext → call connect() → later call sendGameContext() with slotMap.
145
- *
146
- * @param context - Game context including slotMap for keyword boosting
147
- */
148
- sendGameContext(context: GameContextV1): void;
149
-
150
- /**
151
- * Check if server has sent READY signal (provider connected, ready for audio).
152
- * In preconnect flow, this becomes true after sendGameContext() triggers provider attachment.
153
- */
154
- isServerReady(): boolean;
155
-
156
- // ============= Utility Methods =============
157
- /**
158
- * Get the audio utterance ID for this session
159
- */
160
- getAudioUtteranceId(): string;
161
-
162
- /**
163
- * Get the WebSocket URL being used
164
- */
165
- getUrl(): string;
166
-
167
- /**
168
- * Get the underlying client state (for advanced usage)
169
- */
170
- getState(): ClientState;
171
-
172
71
  }
173
72
 
174
73
  /**
@@ -241,23 +140,51 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
241
140
  // Track the expected UUID for this session
242
141
  this.expectedUuid = this.state.audioUtteranceId;
243
142
 
244
- // If VGF state has promptSlotMap, configure gameContext to use it
245
- if (this.state.promptSlotMap) {
143
+ // Backfill gameContext from state if the caller didn't pass one in config.
144
+ // Lets RecognitionState carry session identity (gameId + gamePhase) alongside
145
+ // runtime prompt inputs (promptSlotMap, promptSTT/STF/TTF), so a single
146
+ // initialState object is sufficient — no separate gameContext required.
147
+ // Backward-compatible: if both are provided, clientConfig.gameContext wins.
148
+ if (!clientConfig.gameContext && this.state.gameId && this.state.gamePhase) {
149
+ clientConfig.gameContext = {
150
+ type: RecognitionContextTypeV1.GAME_CONTEXT,
151
+ gameId: this.state.gameId,
152
+ gamePhase: this.state.gamePhase,
153
+ };
154
+ }
155
+
156
+ // Forward optional VGF inputs (promptSlotMap, promptSTT/STF/TTF) into the GameContext.
157
+ const hasPromptInputs =
158
+ this.state.promptSlotMap !== undefined ||
159
+ this.state.promptSTT !== undefined ||
160
+ this.state.promptSTF !== undefined ||
161
+ this.state.promptTTF !== undefined;
162
+
163
+ if (hasPromptInputs) {
246
164
  // Set useContext=true in ASR config to enable context processing
247
165
  if (clientConfig.asrRequestConfig) {
248
166
  clientConfig.asrRequestConfig.useContext = true;
249
167
  }
250
168
 
251
- // Add promptSlotMap to gameContext
252
169
  if (!clientConfig.gameContext) {
253
- // Only create gameContext if we have gameId and gamePhase
254
- // These should come from the game's configuration
170
+ // No gameContext from config, and state didn't carry gameId+gamePhase
171
+ // either — prompt inputs have nowhere to ride. Warn and drop.
255
172
  if (clientConfig.logger) {
256
- clientConfig.logger('warn', '[VGF] promptSlotMap found but no gameContext provided. SlotMap will not be sent.');
173
+ clientConfig.logger('warn', '[VGF] prompt inputs found but no gameContext provided and state has no gameId/gamePhase. They will not be sent.');
257
174
  }
258
175
  } else {
259
- // Merge promptSlotMap into existing gameContext
260
- clientConfig.gameContext.slotMap = this.state.promptSlotMap;
176
+ if (this.state.promptSlotMap !== undefined) {
177
+ clientConfig.gameContext.slotMap = this.state.promptSlotMap;
178
+ }
179
+ if (this.state.promptSTT !== undefined) {
180
+ clientConfig.gameContext.promptSTT = this.state.promptSTT;
181
+ }
182
+ if (this.state.promptSTF !== undefined) {
183
+ clientConfig.gameContext.promptSTF = this.state.promptSTF;
184
+ }
185
+ if (this.state.promptTTF !== undefined) {
186
+ clientConfig.gameContext.promptTTF = this.state.promptTTF;
187
+ }
261
188
  }
262
189
  }
263
190
 
@@ -303,6 +230,25 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
303
230
  }
304
231
  },
305
232
 
233
+ onSessionConfigured: (sessionConfigured): void => {
234
+ // Skip update if UUID doesn't match (stale callback from previous session)
235
+ if (sessionConfigured.audioUtteranceId && sessionConfigured.audioUtteranceId !== this.expectedUuid) {
236
+ if (this.logger) {
237
+ this.logger('warn',
238
+ `[RecogSDK:VGF] Skipping sessionConfigured update: UUID mismatch (expected: ${this.expectedUuid}, got: ${sessionConfigured.audioUtteranceId})`
239
+ );
240
+ }
241
+ return;
242
+ }
243
+
244
+ this.state = mapSessionConfiguredToState(this.state, sessionConfigured);
245
+ this.notifyStateChange();
246
+
247
+ if (clientConfig.onSessionConfigured) {
248
+ clientConfig.onSessionConfigured(sessionConfigured);
249
+ }
250
+ },
251
+
306
252
  onFunctionCall: (result): void => {
307
253
  // Pass through function call - no VGF state changes needed for P2 feature
308
254
  if (clientConfig.onFunctionCall) {
@@ -322,7 +268,7 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
322
268
  }
323
269
 
324
270
  this.isRecordingAudio = false; // Reset on error
325
- this.state = mapErrorToState(this.state, error);
271
+ this.state = mapErrorToState(this.state);
326
272
  this.notifyStateChange();
327
273
 
328
274
  if (clientConfig.onError) {
@@ -367,6 +313,17 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
367
313
  this.client.sendAudioWithSampleRate(audioData, sourceSampleRate);
368
314
  }
369
315
 
316
+ sendPrefixAudio(audioData: ArrayBuffer | ArrayBufferView | Blob): void {
317
+ // Pure pass-through. Prefix audio is NOT user audio, so do not flip
318
+ // startRecordingStatus to RECORDING — that transition belongs to the
319
+ // first sendAudio() call.
320
+ this.client.sendPrefixAudio(audioData);
321
+ }
322
+
323
+ getStats(): IRecognitionClientStats {
324
+ return this.client.getStats();
325
+ }
326
+
370
327
  /**
371
328
  * Set VGF recording status to RECORDING on the first audio chunk.
372
329
  * Idempotent — subsequent calls are no-ops until disconnect/stop resets
@@ -490,16 +447,10 @@ export class SimplifiedVGFRecognitionClient implements ISimplifiedVGFRecognition
490
447
  return { ...this.state };
491
448
  }
492
449
 
493
- private isTerminalStatus(status: string | undefined): boolean {
494
- return status === TranscriptionStatus.FINALIZED ||
495
- status === TranscriptionStatus.ABORTED ||
496
- status === TranscriptionStatus.ERROR;
497
- }
498
-
499
450
  private notifyStateChange(): void {
500
451
 
501
452
  // Block duplicate terminal status emissions for THIS session
502
- if (this.isTerminalStatus(this.state.transcriptionStatus)) {
453
+ if (isTerminal(this.state)) {
503
454
  if (this.lastSentTerminalUuid === this.expectedUuid) {
504
455
  // Already sent a terminal status for this session - suppress duplicate
505
456
  if (this.logger) {
@@ -3,6 +3,8 @@
3
3
  */
4
4
 
5
5
  import {
6
+ mapTranscriptionResultToState,
7
+ mapSessionConfiguredToState,
6
8
  resetRecognitionVGFState
7
9
  } from './vgf-recognition-mapper.js';
8
10
  import {
@@ -11,6 +13,13 @@ import {
11
13
  TranscriptionStatus,
12
14
  RecognitionActionProcessingState
13
15
  } from './vgf-recognition-state.js';
16
+ import {
17
+ ASRApiType,
18
+ DetectionTypeV1,
19
+ RecognitionResultTypeV1,
20
+ SessionConfiguredV1,
21
+ TranscriptionResultV1
22
+ } from '@recog/shared-types';
14
23
 
15
24
  describe('resetRecognitionVGFState', () => {
16
25
  it('should generate a new UUID', () => {
@@ -75,4 +84,138 @@ describe('resetRecognitionVGFState', () => {
75
84
  expect(newState.startRecordingStatus).toBe(RecordingStatus.READY);
76
85
  expect(newState.recognitionActionProcessingState).toBe(RecognitionActionProcessingState.NOT_STARTED);
77
86
  });
87
+
88
+ it('should preserve prompt input fields and clear detections', () => {
89
+ const originalState: RecognitionState = {
90
+ audioUtteranceId: 'old-uuid-123',
91
+ pendingTranscript: '',
92
+ promptSTT: 'hello,world',
93
+ promptSTF: 'map to play()',
94
+ promptTTF: 'extract title,artist',
95
+ detections: [
96
+ { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.92 }
97
+ ]
98
+ };
99
+
100
+ const newState = resetRecognitionVGFState(originalState);
101
+
102
+ expect(newState.promptSTT).toBe('hello,world');
103
+ expect(newState.promptSTF).toBe('map to play()');
104
+ expect(newState.promptTTF).toBe('extract title,artist');
105
+ expect(newState.detections).toBeUndefined();
106
+ });
107
+ });
108
+
109
+ describe('mapTranscriptionResultToState detections', () => {
110
+ const baseState: RecognitionState = {
111
+ audioUtteranceId: 'utt-1',
112
+ pendingTranscript: ''
113
+ };
114
+
115
+ it('should copy detections from a pending transcript result', () => {
116
+ const result: TranscriptionResultV1 = {
117
+ type: RecognitionResultTypeV1.TRANSCRIPTION,
118
+ audioUtteranceId: 'utt-1',
119
+ finalTranscript: '',
120
+ finalTranscriptRaw: '',
121
+ pendingTranscript: 'one',
122
+ is_finished: false,
123
+ detections: [
124
+ { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
125
+ ]
126
+ };
127
+
128
+ const newState = mapTranscriptionResultToState(baseState, result, true);
129
+
130
+ expect(newState.detections).toEqual([
131
+ { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.81, startMs: 100, endMs: 900 }
132
+ ]);
133
+ });
134
+
135
+ it('should copy detections from a final transcript result', () => {
136
+ const result: TranscriptionResultV1 = {
137
+ type: RecognitionResultTypeV1.TRANSCRIPTION,
138
+ audioUtteranceId: 'utt-1',
139
+ finalTranscript: 'one time',
140
+ finalTranscriptRaw: 'one time',
141
+ is_finished: true,
142
+ detections: [
143
+ { type: DetectionTypeV1.SEARCH, query: 'one time', score: 0.93 }
144
+ ]
145
+ };
146
+
147
+ const newState = mapTranscriptionResultToState(baseState, result, false);
148
+
149
+ expect(newState.detections).toHaveLength(1);
150
+ expect(newState.detections?.[0]?.query).toBe('one time');
151
+ });
152
+
153
+ it('should mirror SessionConfiguredV1 onto state and clear on reset', () => {
154
+ const sessionConfigured: SessionConfiguredV1 = {
155
+ type: RecognitionResultTypeV1.SESSION_CONFIGURED,
156
+ audioUtteranceId: 'utt-1',
157
+ provider: 'deepgram',
158
+ model: 'nova-3',
159
+ sampleRate: 16000,
160
+ encoding: 'linear16',
161
+ apiType: ASRApiType.STREAMING,
162
+ isFallback: false
163
+ };
164
+ const afterConfigured = mapSessionConfiguredToState(baseState, sessionConfigured);
165
+ expect(afterConfigured.sessionConfigured?.provider).toBe('deepgram');
166
+ expect(afterConfigured.sessionConfigured?.model).toBe('nova-3');
167
+ expect(afterConfigured.sessionConfigured?.isFallback).toBe(false);
168
+
169
+ const afterReset = resetRecognitionVGFState(afterConfigured);
170
+ expect(afterReset.sessionConfigured).toBeUndefined();
171
+ });
172
+
173
+ it('should copy accumulatedAudioTimeMs on pending and final transcripts and clear on reset', () => {
174
+ const pending: TranscriptionResultV1 = {
175
+ type: RecognitionResultTypeV1.TRANSCRIPTION,
176
+ audioUtteranceId: 'utt-1',
177
+ finalTranscript: '',
178
+ finalTranscriptRaw: '',
179
+ pendingTranscript: 'hi',
180
+ is_finished: false,
181
+ accumulatedAudioTimeMs: 1234
182
+ };
183
+ const afterPending = mapTranscriptionResultToState(baseState, pending, true);
184
+ expect(afterPending.accumulatedAudioTimeMs).toBe(1234);
185
+
186
+ const final: TranscriptionResultV1 = {
187
+ type: RecognitionResultTypeV1.TRANSCRIPTION,
188
+ audioUtteranceId: 'utt-1',
189
+ finalTranscript: 'hi',
190
+ finalTranscriptRaw: 'hi',
191
+ is_finished: true,
192
+ accumulatedAudioTimeMs: 5678
193
+ };
194
+ const afterFinal = mapTranscriptionResultToState(afterPending, final, false);
195
+ expect(afterFinal.accumulatedAudioTimeMs).toBe(5678);
196
+
197
+ const afterReset = resetRecognitionVGFState(afterFinal);
198
+ expect(afterReset.accumulatedAudioTimeMs).toBeUndefined();
199
+ });
200
+
201
+ it('should leave existing detections untouched when the result omits them', () => {
202
+ const stateWithDetections: RecognitionState = {
203
+ ...baseState,
204
+ detections: [{ type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }]
205
+ };
206
+ const result: TranscriptionResultV1 = {
207
+ type: RecognitionResultTypeV1.TRANSCRIPTION,
208
+ audioUtteranceId: 'utt-1',
209
+ finalTranscript: '',
210
+ finalTranscriptRaw: '',
211
+ pendingTranscript: 'hello',
212
+ is_finished: false
213
+ };
214
+
215
+ const newState = mapTranscriptionResultToState(stateWithDetections, result, true);
216
+
217
+ expect(newState.detections).toEqual([
218
+ { type: DetectionTypeV1.SEARCH, query: 'prev', score: 0.5 }
219
+ ]);
220
+ });
78
221
  });
@@ -19,7 +19,7 @@ import {
19
19
  } from './recognition-client.types.js';
20
20
  import {
21
21
  TranscriptionResultV1,
22
- ErrorResultV1
22
+ SessionConfiguredV1
23
23
  } from '@recog/shared-types';
24
24
 
25
25
  /**
@@ -94,6 +94,9 @@ export function mapTranscriptionResultToState(
94
94
  if (result.lastNonSilence !== undefined) {
95
95
  newState.lastNonSilence = result.lastNonSilence;
96
96
  }
97
+ if (result.accumulatedAudioTimeMs !== undefined) {
98
+ newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
99
+ }
97
100
  } else {
98
101
  // Transcription is finished
99
102
  newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
@@ -110,21 +113,46 @@ export function mapTranscriptionResultToState(
110
113
  if (result.lastNonSilence !== undefined) {
111
114
  newState.lastNonSilence = result.lastNonSilence;
112
115
  }
116
+ if (result.accumulatedAudioTimeMs !== undefined) {
117
+ newState.accumulatedAudioTimeMs = result.accumulatedAudioTimeMs;
118
+ }
113
119
 
114
120
  // Clear pending when we have final
115
121
  newState.pendingTranscript = "";
116
122
  newState.pendingConfidence = undefined;
117
123
  }
118
124
 
125
+ // Mirror provider-reported detections (currently Deepgram `search`; future entries
126
+ // may include keywords/keyterms/speech_contexts from other providers).
127
+ // Server pre-sorts by score descending, so [0] is the top hit — passed through as-is.
128
+ if (result.detections !== undefined) {
129
+ newState.detections = result.detections;
130
+ }
131
+
119
132
  return newState;
120
133
  }
121
134
 
135
+ /**
136
+ * Mirrors the SessionConfiguredV1 message onto the VGF state.
137
+ * Carries the resolved provider/model/sampleRate/encoding/apiType/isFallback
138
+ * that the server actually chose (after circuit-breaker/fallback). Fires once
139
+ * per session, before audio streaming begins.
140
+ */
141
+ export function mapSessionConfiguredToState(
142
+ currentState: RecognitionState,
143
+ sessionConfigured: SessionConfiguredV1
144
+ ): RecognitionState {
145
+ return {
146
+ ...currentState,
147
+ sessionConfigured
148
+ };
149
+ }
150
+
122
151
  /**
123
152
  * Maps error to state
124
153
  */
125
154
  export function mapErrorToState(
126
- currentState: RecognitionState,
127
- error: ErrorResultV1
155
+ currentState: RecognitionState
128
156
  ): RecognitionState {
129
157
  return {
130
158
  ...currentState,
@@ -185,7 +213,10 @@ export function resetRecognitionVGFState(currentState: RecognitionState): Recogn
185
213
  recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
186
214
  finalTranscript: undefined,
187
215
  voiceEnd: undefined,
188
- lastNonSilence: undefined
216
+ lastNonSilence: undefined,
217
+ accumulatedAudioTimeMs: undefined,
218
+ detections: undefined,
219
+ sessionConfigured: undefined
189
220
  };
190
221
  }
191
222
 
@@ -199,47 +230,6 @@ export function updateStateOnReady(currentState: RecognitionState): RecognitionS
199
230
  };
200
231
  }
201
232
 
202
- /**
203
- * Parses function call from transcript (STEP 3 support)
204
- * This is a placeholder - actual implementation would use NLP/LLM
205
- */
206
- export function extractFunctionCallFromTranscript(
207
- transcript: string,
208
- gameContext?: any
209
- ): { metadata?: string; confidence?: number } | null {
210
- // This would be replaced with actual function call extraction logic
211
- // For example, using an LLM to parse intent from the transcript
212
- // and map it to game actions
213
-
214
- // Example stub implementation:
215
- const lowerTranscript = transcript.toLowerCase();
216
-
217
- // Simple pattern matching for demo
218
- if (lowerTranscript.includes("play") && lowerTranscript.includes("artist")) {
219
- return {
220
- metadata: JSON.stringify({ action: "play", target: "artist" }),
221
- confidence: 0.8
222
- };
223
- }
224
-
225
- return null;
226
- }
227
-
228
- /**
229
- * Updates state with function call results (STEP 3)
230
- */
231
- export function updateStateWithFunctionCall(
232
- currentState: RecognitionState,
233
- functionCall: { metadata?: string; confidence?: number }
234
- ): RecognitionState {
235
- return {
236
- ...currentState,
237
- functionCallMetadata: functionCall.metadata,
238
- functionCallConfidence: functionCall.confidence,
239
- finalFunctionCallTimestamp: new Date().toISOString()
240
- };
241
- }
242
-
243
233
  // Helper function to generate UUID (simplified version)
244
234
  function generateUUID(): string {
245
235
  return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
@@ -1,4 +1,5 @@
1
1
  import { z } from "zod"
2
+ import { DetectionV1Schema, SessionConfiguredSchemaV1 } from "@recog/shared-types"
2
3
 
3
4
  /**
4
5
  * VGF-style state schema for game-side recognition state/results management.
@@ -25,9 +26,13 @@ export const RecognitionVGFStateSchema = z.object({
25
26
  // Voice timing (ms from stream start, prefix-adjusted)
26
27
  voiceEnd: z.number().optional(), // voice end time identified by ASR
27
28
  lastNonSilence: z.number().optional(), // last non-silence sample time from PCM analysis
29
+ accumulatedAudioTimeMs: z.number().optional(), // total user audio time watermark (ms) — mirrors TranscriptionResultV1.accumulatedAudioTimeMs
28
30
 
29
31
  // Tracking-only metadata
30
- asrConfig: z.string().optional(), // Json format of the ASR config
32
+ asrConfig: z.string().optional(), // Json format of the *requested* ASR config (set once at construction).
33
+ // For the *resolved* truth — actual provider/model/sampleRate/encoding/apiType/isFallback chosen by the
34
+ // server after circuit-breaker/fallback — see `sessionConfigured` below.
35
+ sessionConfigured: SessionConfiguredSchemaV1.optional(), // Mirrors the SessionConfiguredV1 message; populated when the server emits it (before audio streams).
31
36
  startRecordingTimestamp: z.string().optional(), // Start of recording. Immutable after set.
32
37
  finalRecordingTimestamp: z.string().optional(), // End of recording. Immutable after set. Transcription may still be in progress.
33
38
  finalTranscriptionTimestamp: z.string().optional(), // When the final transcript was produced. Immutable after set.
@@ -41,9 +46,31 @@ export const RecognitionVGFStateSchema = z.object({
41
46
  functionCallConfidence: z.number().optional(), // Confidence score for the function call.
42
47
  finalFunctionCallTimestamp: z.string().optional(), // When the final action after interpreting the transcript was taken. Immutable.
43
48
 
49
+ // Session identity — when set, the VGF client backfills these into
50
+ // GameContextV1 if the caller didn't pass a `gameContext` in config.
51
+ // Lets RecognitionState be the single source of truth: server seeds
52
+ // `gameId` + `gamePhase` + `promptSlotMap` per player, controller passes
53
+ // the whole state as `initialState`, no separate `gameContext` needed.
54
+ // Backward-compatible: if `gameContext` is also passed in config, it wins.
55
+ gameId: z.string().optional(),
56
+ gamePhase: z.string().optional(),
57
+
44
58
  // Support for prompt slot mapping - passed to recognition context when present
45
59
  promptSlotMap: z.record(z.string(), z.array(z.string())).optional(), // Optional map of slot names to prompt values for recognition context
46
60
 
61
+ // Optional prompt inputs - when set, forwarded into GameContext at client creation.
62
+ // Mirror the GameContextV1 fields: STT (ASR keywords/keyterms), STF (speech->function), TTF (text->function).
63
+ promptSTT: z.string().optional(),
64
+ promptSTF: z.string().optional(),
65
+ promptTTF: z.string().optional(),
66
+
67
+ // Provider-reported phrase detections from the last transcript message.
68
+ // Mirrors TranscriptionResultV1.detections — a heterogeneous list keyed by DetectionTypeV1
69
+ // (today only 'search' from Deepgram; future entries may include keywords/keyterms/speech_contexts).
70
+ // Sorted by `score` descending by the server (see deepgram/message-handlers/v1/transform-transcript.ts
71
+ // and provider-to-recognition-transformer.ts), so [0] is the top hit — no client-side re-rank needed.
72
+ detections: z.array(DetectionV1Schema).optional(),
73
+
47
74
  // Recognition action processing state - managed externally, SDK preserves but never modifies
48
75
  recognitionActionProcessingState: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
49
76
  })
@@ -89,6 +116,22 @@ export function createInitialRecognitionState(audioUtteranceId: string): Recogni
89
116
  }
90
117
  }
91
118
 
119
+ // Helper for "session ended — no more state updates coming".
120
+ // Terminal states are FINALIZED (clean end with transcript), ABORTED
121
+ // (user cancelled), and ERROR (something failed). Use this anywhere
122
+ // you forward RecognitionState updates to a server thunk: persist
123
+ // every update, but only do cleanup / observability / scoring work
124
+ // after isTerminal(state) is true. Branch on `transcriptionStatus`
125
+ // when the action depends on *which* terminal state (e.g. only score
126
+ // on FINALIZED, only emit "cancelled" telemetry on ABORTED).
127
+ export function isTerminal(state: Pick<RecognitionState, "transcriptionStatus">): boolean {
128
+ return (
129
+ state.transcriptionStatus === TranscriptionStatus.FINALIZED ||
130
+ state.transcriptionStatus === TranscriptionStatus.ABORTED ||
131
+ state.transcriptionStatus === TranscriptionStatus.ERROR
132
+ )
133
+ }
134
+
92
135
  // Helper function to validate state transitions
93
136
  export function isValidRecordingStatusTransition(from: string | undefined, to: string): boolean {
94
137
  const statusOrder = [