@volley/recognition-client-sdk 0.1.200
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +168 -0
- package/dist/browser-CDQ_TzeH.d.ts +1039 -0
- package/dist/index.d.ts +461 -0
- package/dist/index.js +2332 -0
- package/dist/index.js.map +1 -0
- package/dist/recog-client-sdk.browser.d.ts +2 -0
- package/dist/recog-client-sdk.browser.js +1843 -0
- package/dist/recog-client-sdk.browser.js.map +1 -0
- package/package.json +73 -0
- package/src/browser.ts +24 -0
- package/src/config-builder.ts +213 -0
- package/src/factory.ts +43 -0
- package/src/index.ts +86 -0
- package/src/recognition-client.spec.ts +551 -0
- package/src/recognition-client.ts +595 -0
- package/src/recognition-client.types.ts +260 -0
- package/src/simplified-vgf-recognition-client.spec.ts +671 -0
- package/src/simplified-vgf-recognition-client.ts +339 -0
- package/src/utils/audio-ring-buffer.ts +170 -0
- package/src/utils/message-handler.ts +131 -0
- package/src/utils/url-builder.ts +70 -0
- package/src/vgf-recognition-mapper.ts +225 -0
- package/src/vgf-recognition-state.ts +89 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* VGF Recognition Mapper
|
|
3
|
+
*
|
|
4
|
+
* Maps between the existing recognition client types and the simplified VGF state.
|
|
5
|
+
* This provides a clean abstraction layer for game developers.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import {
  TranscriptionResultV1,
  MetadataResultV1,
  ErrorResultV1,
  ASRRequestConfig
} from '@recog/shared-types';
import {
  ClientState,
  IRecognitionClientConfig
} from './recognition-client.types.js';
import {
  RecognitionState,
  RecordingStatus,
  RecordingStatusType,
  TranscriptionStatus,
  createInitialRecognitionState
} from './vgf-recognition-state.js';
|
|
24
|
+
|
|
25
|
+
/**
|
|
26
|
+
* Maps ClientState to RecordingStatus for VGF state
|
|
27
|
+
*/
|
|
28
|
+
export function mapClientStateToRecordingStatus(clientState: ClientState): string {
|
|
29
|
+
switch (clientState) {
|
|
30
|
+
case ClientState.INITIAL:
|
|
31
|
+
case ClientState.CONNECTING:
|
|
32
|
+
case ClientState.CONNECTED:
|
|
33
|
+
return RecordingStatus.NOT_READY;
|
|
34
|
+
|
|
35
|
+
case ClientState.READY:
|
|
36
|
+
// Ready to record, but not recording yet
|
|
37
|
+
return RecordingStatus.READY;
|
|
38
|
+
|
|
39
|
+
case ClientState.STOPPING:
|
|
40
|
+
case ClientState.STOPPED:
|
|
41
|
+
case ClientState.FAILED:
|
|
42
|
+
return RecordingStatus.FINISHED;
|
|
43
|
+
|
|
44
|
+
default:
|
|
45
|
+
return RecordingStatus.NOT_READY;
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/**
|
|
50
|
+
* Creates a VGF state from transcription result
|
|
51
|
+
*/
|
|
52
|
+
export function mapTranscriptionResultToState(
|
|
53
|
+
currentState: RecognitionState,
|
|
54
|
+
result: TranscriptionResultV1,
|
|
55
|
+
isRecording: boolean
|
|
56
|
+
): RecognitionState {
|
|
57
|
+
const newState = { ...currentState };
|
|
58
|
+
|
|
59
|
+
// Update recording status if actively recording
|
|
60
|
+
if (isRecording && currentState.startRecordingStatus !== RecordingStatus.FINISHED) {
|
|
61
|
+
newState.startRecordingStatus = RecordingStatus.RECORDING;
|
|
62
|
+
|
|
63
|
+
// Set start timestamp on first audio
|
|
64
|
+
if (!newState.startRecordingTimestamp) {
|
|
65
|
+
newState.startRecordingTimestamp = new Date().toISOString();
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
// Update transcription status
|
|
70
|
+
if (!result.is_finished) {
|
|
71
|
+
// Has pending transcript - STEP 2 support
|
|
72
|
+
newState.transcriptionStatus = TranscriptionStatus.IN_PROGRESS;
|
|
73
|
+
|
|
74
|
+
// Direct copy of pending transcript without any combination
|
|
75
|
+
newState.pendingTranscript = result.pendingTranscript || "";
|
|
76
|
+
|
|
77
|
+
// Direct copy of pending confidence
|
|
78
|
+
if (result.pendingTranscriptConfidence !== undefined) {
|
|
79
|
+
newState.pendingConfidence = result.pendingTranscriptConfidence;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Also update final transcript if we have it (even if not finished)
|
|
83
|
+
if (result.finalTranscript) {
|
|
84
|
+
newState.finalTranscript = result.finalTranscript;
|
|
85
|
+
if (result.finalTranscriptConfidence !== undefined) {
|
|
86
|
+
newState.finalConfidence = result.finalTranscriptConfidence;
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
} else {
|
|
90
|
+
// Transcription is finished
|
|
91
|
+
newState.transcriptionStatus = TranscriptionStatus.FINALIZED;
|
|
92
|
+
newState.finalTranscript = result.finalTranscript || "";
|
|
93
|
+
if (result.finalTranscriptConfidence !== undefined) {
|
|
94
|
+
newState.finalConfidence = result.finalTranscriptConfidence;
|
|
95
|
+
}
|
|
96
|
+
newState.finalTranscriptionTimestamp = new Date().toISOString();
|
|
97
|
+
|
|
98
|
+
// Clear pending when we have final
|
|
99
|
+
newState.pendingTranscript = "";
|
|
100
|
+
newState.pendingConfidence = undefined;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
return newState;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Maps metadata result to update state timestamps
|
|
108
|
+
*/
|
|
109
|
+
export function mapMetadataToState(
|
|
110
|
+
currentState: RecognitionState,
|
|
111
|
+
metadata: MetadataResultV1
|
|
112
|
+
): RecognitionState {
|
|
113
|
+
const newState = { ...currentState };
|
|
114
|
+
|
|
115
|
+
// Update final recording timestamp when metadata arrives
|
|
116
|
+
if (!newState.finalRecordingTimestamp) {
|
|
117
|
+
newState.finalRecordingTimestamp = new Date().toISOString();
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
// Recording is finished when metadata arrives
|
|
121
|
+
newState.startRecordingStatus = RecordingStatus.FINISHED;
|
|
122
|
+
|
|
123
|
+
return newState;
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Maps error to state
|
|
128
|
+
*/
|
|
129
|
+
export function mapErrorToState(
|
|
130
|
+
currentState: RecognitionState,
|
|
131
|
+
error: ErrorResultV1
|
|
132
|
+
): RecognitionState {
|
|
133
|
+
return {
|
|
134
|
+
...currentState,
|
|
135
|
+
transcriptionStatus: TranscriptionStatus.ERROR,
|
|
136
|
+
startRecordingStatus: RecordingStatus.FINISHED,
|
|
137
|
+
finalRecordingTimestamp: new Date().toISOString()
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
|
|
142
|
+
* Creates initial VGF state from client config
|
|
143
|
+
*/
|
|
144
|
+
export function createVGFStateFromConfig(config: IRecognitionClientConfig): RecognitionState {
|
|
145
|
+
const audioUtteranceId = config.audioUtteranceId || generateUUID();
|
|
146
|
+
const state = createInitialRecognitionState(audioUtteranceId);
|
|
147
|
+
|
|
148
|
+
// Store ASR config as JSON if provided
|
|
149
|
+
if (config.asrRequestConfig) {
|
|
150
|
+
state.asrConfig = JSON.stringify(config.asrRequestConfig);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
return state;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Updates state when recording stops
|
|
158
|
+
*/
|
|
159
|
+
export function updateStateOnStop(currentState: RecognitionState): RecognitionState {
|
|
160
|
+
return {
|
|
161
|
+
...currentState,
|
|
162
|
+
startRecordingStatus: RecordingStatus.FINISHED,
|
|
163
|
+
finalRecordingTimestamp: new Date().toISOString()
|
|
164
|
+
};
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/**
|
|
168
|
+
* Updates state when client becomes ready
|
|
169
|
+
*/
|
|
170
|
+
export function updateStateOnReady(currentState: RecognitionState): RecognitionState {
|
|
171
|
+
return {
|
|
172
|
+
...currentState,
|
|
173
|
+
startRecordingStatus: RecordingStatus.READY
|
|
174
|
+
};
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/**
|
|
178
|
+
* Parses function call from transcript (STEP 3 support)
|
|
179
|
+
* This is a placeholder - actual implementation would use NLP/LLM
|
|
180
|
+
*/
|
|
181
|
+
export function extractFunctionCallFromTranscript(
|
|
182
|
+
transcript: string,
|
|
183
|
+
gameContext?: any
|
|
184
|
+
): { metadata?: string; confidence?: number } | null {
|
|
185
|
+
// This would be replaced with actual function call extraction logic
|
|
186
|
+
// For example, using an LLM to parse intent from the transcript
|
|
187
|
+
// and map it to game actions
|
|
188
|
+
|
|
189
|
+
// Example stub implementation:
|
|
190
|
+
const lowerTranscript = transcript.toLowerCase();
|
|
191
|
+
|
|
192
|
+
// Simple pattern matching for demo
|
|
193
|
+
if (lowerTranscript.includes("play") && lowerTranscript.includes("artist")) {
|
|
194
|
+
return {
|
|
195
|
+
metadata: JSON.stringify({ action: "play", target: "artist" }),
|
|
196
|
+
confidence: 0.8
|
|
197
|
+
};
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
return null;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/**
|
|
204
|
+
* Updates state with function call results (STEP 3)
|
|
205
|
+
*/
|
|
206
|
+
export function updateStateWithFunctionCall(
|
|
207
|
+
currentState: RecognitionState,
|
|
208
|
+
functionCall: { metadata?: string; confidence?: number }
|
|
209
|
+
): RecognitionState {
|
|
210
|
+
return {
|
|
211
|
+
...currentState,
|
|
212
|
+
functionCallMetadata: functionCall.metadata,
|
|
213
|
+
functionCallConfidence: functionCall.confidence,
|
|
214
|
+
finalFunctionCallTimestamp: new Date().toISOString()
|
|
215
|
+
};
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// Helper function to generate UUID (simplified version)
|
|
219
|
+
function generateUUID(): string {
|
|
220
|
+
return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
|
|
221
|
+
const r = Math.random() * 16 | 0;
|
|
222
|
+
const v = c === 'x' ? r : (r & 0x3 | 0x8);
|
|
223
|
+
return v.toString(16);
|
|
224
|
+
});
|
|
225
|
+
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { z } from "zod"
|
|
2
|
+
|
|
3
|
+
/**
 * VGF-style state schema for game-side recognition state/results management.
 *
 * This schema provides a standardized way for game developers to manage
 * voice recognition state and results in their applications. It supports:
 *
 * STEP 1: Basic transcription flow
 * STEP 2: Mic auto-stop upon correct answer (using partial transcripts)
 * STEP 3: Semantic/function-call outcomes for game actions
 *
 * Ideally this should be part of a more centralized shared type library to free
 * game developers and provide helper functions (VGF? Platform SDK?).
 */
export const RecognitionVGFStateSchema = z.object({
  // Core STT state
  audioUtteranceId: z.string(), // Unique id for this utterance; see createInitialRecognitionState
  startRecordingStatus: z.string().optional(), // "NOT_READY", "READY", "RECORDING", "FINISHED". States follow this order.
  // Streaming should only start when "READY". Other states control mic UI and recording.
  transcriptionStatus: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "FINALIZED", "ERROR"
  finalTranscript: z.string().optional(), // Full finalized transcript for the utterance. Will not change.
  finalConfidence: z.number().optional(), // Confidence score for the final transcript.

  // Tracking-only metadata
  asrConfig: z.string().optional(), // Json format of the ASR config
  startRecordingTimestamp: z.string().optional(), // Start of recording. Immutable after set.
  finalRecordingTimestamp: z.string().optional(), // End of recording. Immutable after set. Transcription may still be in progress.
  finalTranscriptionTimestamp: z.string().optional(), // When the final transcript was produced. Immutable after set.

  // STEP 2: Support for mic auto-stop upon correct answer
  pendingTranscript: z.string().optional().default(""), // Non-final transcript that may change (matches existing naming)
  pendingConfidence: z.number().optional(), // Confidence score for the pending transcript.

  // STEP 3: Support for semantic/function-call outcomes
  functionCallMetadata: z.string().optional(), // Function call metadata in JSON, e.g. "{artist: true, title: true}"
  functionCallConfidence: z.number().optional(), // Confidence score for the function call.
  finalFunctionCallTimestamp: z.string().optional(), // When the final action after interpreting the transcript was taken. Immutable.

  // Support for prompt slot mapping - passed to recognition context when present
  promptSlotMap: z.record(z.string(), z.array(z.string())).optional(), // Optional map of slot names to prompt values for recognition context
})

// Inferred TypeScript type for the schema above.
export type RecognitionState = z.infer<typeof RecognitionVGFStateSchema>
|
|
45
|
+
|
|
46
|
+
// Status constants for better type safety and consistency
// Recording lifecycle: NOT_READY -> READY -> RECORDING -> FINISHED
// (forward-only; see isValidRecordingStatusTransition).
export const RecordingStatus = {
  NOT_READY: "NOT_READY",
  READY: "READY",
  RECORDING: "RECORDING",
  FINISHED: "FINISHED",
} as const

// Union of the literal status strings above.
export type RecordingStatusType = typeof RecordingStatus[keyof typeof RecordingStatus]
|
|
55
|
+
|
|
56
|
+
// Transcription lifecycle constants: NOT_STARTED -> IN_PROGRESS -> FINALIZED,
// with ERROR as the failure terminal state.
export const TranscriptionStatus = {
  NOT_STARTED: "NOT_STARTED",
  IN_PROGRESS: "IN_PROGRESS",
  FINALIZED: "FINALIZED",
  ERROR: "ERROR",
} as const

// Union of the literal status strings above.
export type TranscriptionStatusType = typeof TranscriptionStatus[keyof typeof TranscriptionStatus]
|
|
64
|
+
|
|
65
|
+
// Helper function to create initial state
|
|
66
|
+
export function createInitialRecognitionState(audioUtteranceId: string): RecognitionState {
|
|
67
|
+
return {
|
|
68
|
+
audioUtteranceId,
|
|
69
|
+
startRecordingStatus: RecordingStatus.NOT_READY,
|
|
70
|
+
transcriptionStatus: TranscriptionStatus.NOT_STARTED,
|
|
71
|
+
pendingTranscript: "",
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// Helper function to validate state transitions
|
|
76
|
+
export function isValidRecordingStatusTransition(from: string | undefined, to: string): boolean {
|
|
77
|
+
const statusOrder = [
|
|
78
|
+
RecordingStatus.NOT_READY,
|
|
79
|
+
RecordingStatus.READY,
|
|
80
|
+
RecordingStatus.RECORDING,
|
|
81
|
+
RecordingStatus.FINISHED,
|
|
82
|
+
]
|
|
83
|
+
|
|
84
|
+
const fromIndex = from ? statusOrder.indexOf(from as RecordingStatusType) : -1
|
|
85
|
+
const toIndex = statusOrder.indexOf(to as RecordingStatusType)
|
|
86
|
+
|
|
87
|
+
// Can only move forward in the status order
|
|
88
|
+
return toIndex > fromIndex && toIndex !== -1
|
|
89
|
+
}
|